diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 666d8844a80..e27361ab263 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,13 +22,13 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,9 +69,10 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} + build-2_28-wheels: "true" branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} @@ -79,7 +80,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +90,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 23956a02fbd..31e78f82a62 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" 
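
The two workflow tweaks above are small but behavior-relevant: adding `github.event_name` to the concurrency group keeps runs triggered by different events (for example, a branch push and a nightly `workflow_dispatch` on the same ref) in separate cancellation groups, and pinning `actions/labeler` to the `v4` tag avoids picking up untested changes from the floating `main` ref. A minimal sketch of how the new concurrency key behaves, reusing the same expressions as in the hunk above:

```yaml
# Sketch only. The group string now differs per trigger, so a push-triggered
# build and a dispatched nightly on branch-23.12 land in separate groups:
#   "build-refs/heads/branch-23.12-push"
#   "build-refs/heads/branch-23.12-workflow_dispatch"
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
  cancel-in-progress: true
```

With `cancel-in-progress: true`, only earlier runs in the same group are cancelled, so a scheduled nightly no longer cancels an in-flight merge build on the same branch.
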
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index abf5fcf2f33..40cf0dcd2c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,34 +30,34 @@ jobs: #- pandas-tests-diff #- pandas-tests-diff-comment secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -65,14 +65,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -82,7 +82,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -92,7 +92,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -102,21 +102,22 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request + build-2_28-wheels: "true" script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -124,7 +125,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -132,7 +133,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -141,7 +142,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 49a9c73d026..0d4401160e1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: 
"ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly @@ -97,9 +97,8 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: - needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -109,7 +108,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: nightly diff --git a/CHANGELOG.md b/CHANGELOG.md index ecd547ab5b3..3cb6caa25ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,140 @@ +# cuDF 23.12.00 (6 Dec 2023) + +## 🚨 Breaking Changes + +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#14562](https://github.com/rapidsai/cudf/pull/14562)) [@raydouglass](https://github.com/raydouglass) +- Fix data corruption when skipping rows ([#14557](https://github.com/rapidsai/cudf/pull/14557)) [@etseidl](https://github.com/etseidl) +- Fix function name typo in `cudf.pandas` profiler ([#14514](https://github.com/rapidsai/cudf/pull/14514)) [@galipremsagar](https://github.com/galipremsagar) +- Fix intermediate type checking in expression parsing ([#14445](https://github.com/rapidsai/cudf/pull/14445)) [@vyasr](https://github.com/vyasr) +- Forward merge `branch-23.10` into `branch-23.12` ([#14435](https://github.com/rapidsai/cudf/pull/14435)) [@raydouglass](https://github.com/raydouglass) +- Remove needs: 
wheel-build-cudf. ([#14427](https://github.com/rapidsai/cudf/pull/14427)) [@bdice](https://github.com/bdice) +- Fix dask dependency in custreamz ([#14420](https://github.com/rapidsai/cudf/pull/14420)) [@vyasr](https://github.com/vyasr) +- Ensure nvbench initializes nvml context when built statically ([#14411](https://github.com/rapidsai/cudf/pull/14411)) [@robertmaynard](https://github.com/robertmaynard) +- Support java AST String literal with desired encoding ([#14402](https://github.com/rapidsai/cudf/pull/14402)) [@winningsix](https://github.com/winningsix) +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Always build nvbench statically so we don't need to package it ([#14399](https://github.com/rapidsai/cudf/pull/14399)) [@robertmaynard](https://github.com/robertmaynard) +- Fix token-count logic in nvtext::tokenize_with_vocabulary ([#14393](https://github.com/rapidsai/cudf/pull/14393)) [@davidwendt](https://github.com/davidwendt) +- Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length ([#14390](https://github.com/rapidsai/cudf/pull/14390)) [@mroeschke](https://github.com/mroeschke) +- cudf.pandas: cuDF subpath checking in module `__getattr__` ([#14388](https://github.com/rapidsai/cudf/pull/14388)) [@shwina](https://github.com/shwina) +- Fix and disable encoding for nanosecond statistics in ORC writer ([#14367](https://github.com/rapidsai/cudf/pull/14367)) [@vuule](https://github.com/vuule) +- Add the new manylinux builds to the build job ([#14351](https://github.com/rapidsai/cudf/pull/14351)) [@vyasr](https://github.com/vyasr) +- cudf jit parser now supports .pragma instructions with quotes ([#14348](https://github.com/rapidsai/cudf/pull/14348)) [@robertmaynard](https://github.com/robertmaynard) +- Fix overflow check in `cudf::merge` ([#14345](https://github.com/rapidsai/cudf/pull/14345)) [@divyegala](https://github.com/divyegala) +- Add cramjam ([#14344](https://github.com/rapidsai/cudf/pull/14344)) [@vyasr](https://github.com/vyasr) +- Enable `dask_cudf/io` pytests in CI ([#14338](https://github.com/rapidsai/cudf/pull/14338)) [@galipremsagar](https://github.com/galipremsagar) +- Temporarily avoid the current build of pydata-sphinx-theme ([#14332](https://github.com/rapidsai/cudf/pull/14332)) [@vyasr](https://github.com/vyasr) +- Fix host buffer access from device function in the Parquet reader ([#14328](https://github.com/rapidsai/cudf/pull/14328)) [@vuule](https://github.com/vuule) +- Run IO tests for Dask-cuDF ([#14327](https://github.com/rapidsai/cudf/pull/14327)) [@rjzamora](https://github.com/rjzamora) +- Fix logical type issues in the Parquet writer ([#14322](https://github.com/rapidsai/cudf/pull/14322)) [@vuule](https://github.com/vuule) +- Remove aws-sdk-pinning and revert to arrow 12.0.1 ([#14319](https://github.com/rapidsai/cudf/pull/14319)) [@vyasr](https://github.com/vyasr) +- test is_valid before reading column data ([#14318](https://github.com/rapidsai/cudf/pull/14318)) [@etseidl](https://github.com/etseidl) +- Fix gtest validity setting for TextTokenizeTest.Vocabulary 
([#14312](https://github.com/rapidsai/cudf/pull/14312)) [@davidwendt](https://github.com/davidwendt) +- Fixes stack context for json lines format that recovers from invalid JSON lines ([#14309](https://github.com/rapidsai/cudf/pull/14309)) [@elstehle](https://github.com/elstehle) +- Downgrade to Arrow 12.0.0 for aws-sdk-cpp and fix cudf_kafka builds for new CI containers ([#14296](https://github.com/rapidsai/cudf/pull/14296)) [@vyasr](https://github.com/vyasr) +- fixing thread index overflow issue ([#14290](https://github.com/rapidsai/cudf/pull/14290)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix memset error in nvtext::edit_distance_matrix ([#14283](https://github.com/rapidsai/cudf/pull/14283)) [@davidwendt](https://github.com/davidwendt) +- Changes JSON reader's recovery option's behaviour to ignore all characters after a valid JSON record ([#14279](https://github.com/rapidsai/cudf/pull/14279)) [@elstehle](https://github.com/elstehle) +- Handle empty string correctly in Parquet statistics ([#14257](https://github.com/rapidsai/cudf/pull/14257)) [@etseidl](https://github.com/etseidl) +- Fixes behaviour for incomplete lines when `recover_with_nulls` is enabled ([#14252](https://github.com/rapidsai/cudf/pull/14252)) [@elstehle](https://github.com/elstehle) +- cudf::detail::pinned_allocator doesn't throw from `deallocate` ([#14251](https://github.com/rapidsai/cudf/pull/14251)) [@robertmaynard](https://github.com/robertmaynard) +- Fix strings replace for adjacent, identical multi-byte UTF-8 character targets ([#14235](https://github.com/rapidsai/cudf/pull/14235)) [@davidwendt](https://github.com/davidwendt) +- Fix the precision when converting a decimal128 column to an arrow array ([#14230](https://github.com/rapidsai/cudf/pull/14230)) [@jihoonson](https://github.com/jihoonson) +- Fixing parquet list of struct interpretation ([#13715](https://github.com/rapidsai/cudf/pull/13715)) [@hyperbolic2346](https://github.com/hyperbolic2346) + +## 📖 Documentation + +- Fix io reference in docs. 
([#14452](https://github.com/rapidsai/cudf/pull/14452)) [@bdice](https://github.com/bdice) +- Update README ([#14374](https://github.com/rapidsai/cudf/pull/14374)) [@shwina](https://github.com/shwina) +- Example code for blog on new row comparators ([#13795](https://github.com/rapidsai/cudf/pull/13795)) [@divyegala](https://github.com/divyegala) + +## 🚀 New Features + +- Expose streams in public unary APIs ([#14342](https://github.com/rapidsai/cudf/pull/14342)) [@vyasr](https://github.com/vyasr) +- Add python tests for Parquet DELTA_BINARY_PACKED encoder ([#14316](https://github.com/rapidsai/cudf/pull/14316)) [@etseidl](https://github.com/etseidl) +- Update rapids-cmake functions to non-deprecated signatures ([#14265](https://github.com/rapidsai/cudf/pull/14265)) [@robertmaynard](https://github.com/robertmaynard) +- Expose streams in public null mask APIs ([#14263](https://github.com/rapidsai/cudf/pull/14263)) [@vyasr](https://github.com/vyasr) +- Expose streams in binaryop APIs ([#14187](https://github.com/rapidsai/cudf/pull/14187)) [@vyasr](https://github.com/vyasr) +- Add pylibcudf.Scalar that interoperates with Arrow scalars ([#14133](https://github.com/rapidsai/cudf/pull/14133)) [@vyasr](https://github.com/vyasr) +- Add decoder for DELTA_BYTE_ARRAY to Parquet reader ([#14101](https://github.com/rapidsai/cudf/pull/14101)) [@etseidl](https://github.com/etseidl) +- Add DELTA_BINARY_PACKED encoder for Parquet writer ([#14100](https://github.com/rapidsai/cudf/pull/14100)) [@etseidl](https://github.com/etseidl) +- Add BytePairEncoder class to cuDF ([#13891](https://github.com/rapidsai/cudf/pull/13891)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) +- Use `pynvjitlink` for CUDA 12+ MVC ([#13650](https://github.com/rapidsai/cudf/pull/13650)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Build concurrency for nightly and merge triggers ([#14441](https://github.com/rapidsai/cudf/pull/14441)) [@bdice](https://github.com/bdice) +- Cleanup remaining usages of dask dependencies ([#14407](https://github.com/rapidsai/cudf/pull/14407)) [@galipremsagar](https://github.com/galipremsagar) +- Update to Arrow 14.0.1. 
([#14387](https://github.com/rapidsai/cudf/pull/14387)) [@bdice](https://github.com/bdice) +- Remove Cython libcpp wrappers ([#14382](https://github.com/rapidsai/cudf/pull/14382)) [@vyasr](https://github.com/vyasr) +- Forward-merge branch-23.10 to branch-23.12 ([#14372](https://github.com/rapidsai/cudf/pull/14372)) [@bdice](https://github.com/bdice) +- Upgrade to arrow 14 ([#14371](https://github.com/rapidsai/cudf/pull/14371)) [@galipremsagar](https://github.com/galipremsagar) +- Fix a pytest typo in `test_kurt_skew_error` ([#14368](https://github.com/rapidsai/cudf/pull/14368)) [@galipremsagar](https://github.com/galipremsagar) +- Use new rapids-dask-dependency metapackage for managing dask versions ([#14364](https://github.com/rapidsai/cudf/pull/14364)) [@vyasr](https://github.com/vyasr) +- Change `nullable()` to `has_nulls()` in `cudf::detail::gather` ([#14363](https://github.com/rapidsai/cudf/pull/14363)) [@divyegala](https://github.com/divyegala) +- Split up scan_inclusive.cu to improve its compile time ([#14358](https://github.com/rapidsai/cudf/pull/14358)) [@davidwendt](https://github.com/davidwendt) +- Implement user_datasource_wrapper is_empty() and is_device_read_preferred(). ([#14357](https://github.com/rapidsai/cudf/pull/14357)) [@tpn](https://github.com/tpn) +- Added streams to CSV reader and writer api ([#14340](https://github.com/rapidsai/cudf/pull/14340)) [@shrshi](https://github.com/shrshi) +- Upgrade wheels to use arrow 13 ([#14339](https://github.com/rapidsai/cudf/pull/14339)) [@vyasr](https://github.com/vyasr) +- Rework nvtext::byte_pair_encoding API ([#14337](https://github.com/rapidsai/cudf/pull/14337)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of nvtext::tokenize_with_vocabulary for long strings ([#14336](https://github.com/rapidsai/cudf/pull/14336)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` to `13` ([#14330](https://github.com/rapidsai/cudf/pull/14330)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext replace APIs ([#14329](https://github.com/rapidsai/cudf/pull/14329)) [@davidwendt](https://github.com/davidwendt) +- Drop `pyorc` dependency and use `pandas`/`pyarrow` instead ([#14323](https://github.com/rapidsai/cudf/pull/14323)) [@galipremsagar](https://github.com/galipremsagar) +- Avoid `pyarrow.fs` import for local storage ([#14321](https://github.com/rapidsai/cudf/pull/14321)) [@rjzamora](https://github.com/rjzamora) +- Unpin `dask` and `distributed` for `23.12` development ([#14320](https://github.com/rapidsai/cudf/pull/14320)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext tokenize APIs ([#14317](https://github.com/rapidsai/cudf/pull/14317)) [@davidwendt](https://github.com/davidwendt) +- Added streams to JSON reader and writer api ([#14313](https://github.com/rapidsai/cudf/pull/14313)) [@shrshi](https://github.com/shrshi) +- Minor improvements in `source_info` ([#14308](https://github.com/rapidsai/cudf/pull/14308)) [@vuule](https://github.com/vuule) +- Forward-merge branch-23.10 to branch-23.12 ([#14307](https://github.com/rapidsai/cudf/pull/14307)) 
[@bdice](https://github.com/bdice) +- Add stream parameter to Set Operations (Public List APIs) ([#14305](https://github.com/rapidsai/cudf/pull/14305)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Sort dictionary data alphabetically in the ORC writer ([#14295](https://github.com/rapidsai/cudf/pull/14295)) [@vuule](https://github.com/vuule) +- Expose stream parameter in public strings filter APIs ([#14293](https://github.com/rapidsai/cudf/pull/14293)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Update `shared-action-workflows` references ([#14289](https://github.com/rapidsai/cudf/pull/14289)) [@AyodeAwe](https://github.com/AyodeAwe) +- Register ``partd`` encode dispatch in ``dask_cudf`` ([#14287](https://github.com/rapidsai/cudf/pull/14287)) [@rjzamora](https://github.com/rjzamora) +- Update versioning strategy ([#14285](https://github.com/rapidsai/cudf/pull/14285)) [@vyasr](https://github.com/vyasr) +- Move and rename byte-pair-encoding source files ([#14284](https://github.com/rapidsai/cudf/pull/14284)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings combine APIs ([#14281](https://github.com/rapidsai/cudf/pull/14281)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings contains APIs ([#14280](https://github.com/rapidsai/cudf/pull/14280)) [@davidwendt](https://github.com/davidwendt) +- Add stream parameter to List Sort and Filter APIs ([#14272](https://github.com/rapidsai/cudf/pull/14272)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Use branch-23.12 workflows. ([#14271](https://github.com/rapidsai/cudf/pull/14271)) [@bdice](https://github.com/bdice) +- Refactor LogicalType for Parquet ([#14264](https://github.com/rapidsai/cudf/pull/14264)) [@etseidl](https://github.com/etseidl) +- Centralize chunked reading code in the parquet reader to reader_impl_chunking.cu ([#14262](https://github.com/rapidsai/cudf/pull/14262)) [@nvdbaranec](https://github.com/nvdbaranec) +- Expose stream parameter in public strings replace APIs ([#14261](https://github.com/rapidsai/cudf/pull/14261)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings APIs ([#14260](https://github.com/rapidsai/cudf/pull/14260)) [@davidwendt](https://github.com/davidwendt) +- Cleanup of namespaces in parquet code. 
([#14259](https://github.com/rapidsai/cudf/pull/14259)) [@nvdbaranec](https://github.com/nvdbaranec) +- Make parquet schema index type consistent ([#14256](https://github.com/rapidsai/cudf/pull/14256)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Add in java bindings for DataSource ([#14254](https://github.com/rapidsai/cudf/pull/14254)) [@revans2](https://github.com/revans2) +- Reimplement `cudf::merge` for nested types without using comparators ([#14250](https://github.com/rapidsai/cudf/pull/14250)) [@divyegala](https://github.com/divyegala) +- Add stream parameter to List Manipulation and Operations APIs ([#14248](https://github.com/rapidsai/cudf/pull/14248)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter in public strings split/partition APIs ([#14247](https://github.com/rapidsai/cudf/pull/14247)) [@davidwendt](https://github.com/davidwendt) +- Improve `contains_column` by invoking `contains_table` ([#14238](https://github.com/rapidsai/cudf/pull/14238)) [@PointKernel](https://github.com/PointKernel) +- Detect and report errors in Parquet header parsing ([#14237](https://github.com/rapidsai/cudf/pull/14237)) [@etseidl](https://github.com/etseidl) +- Normalizing offsets iterator ([#14234](https://github.com/rapidsai/cudf/pull/14234)) [@davidwendt](https://github.com/davidwendt) +- Forward merge `23.10` into `23.12` ([#14231](https://github.com/rapidsai/cudf/pull/14231)) [@galipremsagar](https://github.com/galipremsagar) +- Return error if BOOL8 column-type is used with integers-to-hex ([#14208](https://github.com/rapidsai/cudf/pull/14208)) [@davidwendt](https://github.com/davidwendt) +- Enable indexalator for device code ([#14206](https://github.com/rapidsai/cudf/pull/14206)) [@davidwendt](https://github.com/davidwendt) +- Marginally reduce memory footprint of joins ([#14197](https://github.com/rapidsai/cudf/pull/14197)) [@wence-](https://github.com/wence-) +- Add nvtx annotations to spilling-based data movement ([#14196](https://github.com/rapidsai/cudf/pull/14196)) [@wence-](https://github.com/wence-) +- Optimize ORC writer for decimal columns ([#14190](https://github.com/rapidsai/cudf/pull/14190)) [@vuule](https://github.com/vuule) +- Remove the use of volatile in ORC ([#14175](https://github.com/rapidsai/cudf/pull/14175)) [@vuule](https://github.com/vuule) +- Add `bytes_per_second` to distinct_count of stream_compaction nvbench. ([#14172](https://github.com/rapidsai/cudf/pull/14172)) [@Blonck](https://github.com/Blonck) +- Add `bytes_per_second` to transpose benchmark ([#14170](https://github.com/rapidsai/cudf/pull/14170)) [@Blonck](https://github.com/Blonck) +- cuDF: Build CUDA 12.0 ARM conda packages. 
([#14112](https://github.com/rapidsai/cudf/pull/14112)) [@bdice](https://github.com/bdice) +- Add `bytes_per_second` to shift benchmark ([#13950](https://github.com/rapidsai/cudf/pull/13950)) [@Blonck](https://github.com/Blonck) +- Extract `debug_utilities.hpp/cu` from `column_utilities.hpp/cu` ([#13720](https://github.com/rapidsai/cudf/pull/13720)) [@ttnghia](https://github.com/ttnghia) + # cuDF 23.10.00 (11 Oct 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index 64c980d0cb3..677cfc89d52 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,62 @@ #
 cuDF - GPU DataFrames
-**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. +## 📢 cuDF can now be used as a no-code-change accelerator for pandas! To learn more, see [here](https://rapids.ai/cudf-pandas/)! -## Resources - -- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides. -- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference. -- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF. -- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. -- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code. -- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features. - -## Overview - -Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. +cuDF is a GPU DataFrame library for loading joining, aggregating, +filtering, and otherwise manipulating data. cuDF leverages +[libcudf](https://docs.rapids.ai/api/libcudf/stable/), a +blazing-fast C++/CUDA dataframe library and the [Apache +Arrow](https://arrow.apache.org/) columnar format to provide a +GPU-accelerated pandas API. -cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. +You can import `cudf` directly and use it like `pandas`: -For example, the following snippet downloads a CSV, then uses the GPU to parse it into rows and columns and run calculations: ```python -import cudf, requests +import cudf +import requests from io import StringIO url = "https://github.com/plotly/datasets/raw/master/tips.csv" -content = requests.get(url).content.decode('utf-8') +content = requests.get(url).content.decode("utf-8") tips_df = cudf.read_csv(StringIO(content)) -tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 # display average tip by dining party size -print(tips_df.groupby('size').tip_percentage.mean()) +print(tips_df.groupby("size").tip_percentage.mean()) ``` -Output: -``` -size -1 21.729201548727808 -2 16.571919173482897 -3 15.215685473711837 -4 14.594900639351332 -5 14.149548965142023 -6 15.622920072028379 -Name: tip_percentage, dtype: float64 -``` +Or, you can use cuDF as a no-code-change accelerator for pandas, using +[`cudf.pandas`](https://docs.rapids.ai/api/cudf/stable/cudf_pandas). +`cudf.pandas` supports 100% of the pandas API, utilizing cuDF for +supported operations and falling back to pandas when needed: -For additional examples, browse our complete [API documentation](https://docs.rapids.ai/api/cudf/stable/), or check out our more detailed [notebooks](https://github.com/rapidsai/notebooks-contrib). +```python +%load_ext cudf.pandas # pandas operations now use the GPU! -## Quick Start +import pandas as pd +import requests +from io import StringIO -Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. 
+url = "https://github.com/plotly/datasets/raw/master/tips.csv" +content = requests.get(url).content.decode("utf-8") -## Installation +tips_df = pd.read_csv(StringIO(content)) +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 +# display average tip by dining party size +print(tips_df.groupby("size").tip_percentage.mean()) +``` + +## Resources + +- [Try cudf.pandas now](https://nvda.ws/rapids-cudf): Explore `cudf.pandas` on a free GPU enabled instance on Google Colab! +- [Install](https://rapids.ai/start.html): Instructions for installing cuDF and other [RAPIDS](https://rapids.ai) libraries. +- [cudf (Python) documentation](https://docs.rapids.ai/api/cudf/stable/) +- [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. + +## Installation ### CUDA/GPU requirements @@ -65,7 +70,7 @@ cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.htm ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.10 python=3.10 cuda-version=11.8 + cudf=23.12 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION new file mode 100644 index 00000000000..a193fff41e8 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/build.sh b/build.sh index 2ad69712e5d..e5beb51dedf 100755 --- a/build.sh +++ b/build.sh @@ -369,7 +369,7 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \ SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ python -m pip install --no-build-isolation --no-deps . 
fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 8b757fecf5a..f1ad8ee7778 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -9,10 +9,12 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +version=$(rapids-generate-version) + rapids-logger "Begin cpp build" # With boa installed conda build forward to boa -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 9149b5e6bfe..d5b0c9a5edb 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,7 +25,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_VERSION_NUMBER="23.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/build_python.sh b/ci/build_python.sh index 61f160b25f5..32fe7b6b3ce 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -9,6 +9,15 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +package_dir="python" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +echo "${version}" > VERSION +for package_name in cudf dask_cudf cudf_kafka custreamz; do + sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py +done + rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) @@ -16,24 +25,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index a1d52c55b17..ae1d9c3fb1a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -9,9 +9,8 @@ package_dir=$2 source rapids-configure-sccache source rapids-date-string -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -22,8 +21,9 @@ PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" # Patch project metadata files to include the CUDA version suffix and version override. 
pyproject_file="${package_dir}/pyproject.toml" -sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} -sed -i "s/^name = .*/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" # For nightlies we want to ensure that we're pulling in alphas as well. The # easiest way to do so is to augment the spec with a constraint containing a @@ -36,6 +36,8 @@ fi if [[ ${package_name} == "dask_cudf" ]]; then sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} # ptxcompiler and cubinlinker aren't version constrained diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 1b2285b5f22..456a3a289d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -7,20 +7,10 @@ package_dir="python/cudf" export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" -# Force a build using the latest version of the code before this PR -CUDF_BUILD_BRANCH=${1:-""} -WHEEL_NAME="cudf" -if [[ "${CUDF_BUILD_BRANCH}" == "main" ]]; then - MAIN_COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - git checkout $MAIN_COMMIT - WHEEL_NAME="${WHEEL_NAME}_${CUDF_BUILD_BRANCH}" -fi +./ci/build_wheel.sh cudf ${package_dir} -./ci/build_wheel.sh ${WHEEL_NAME} ${package_dir} - -mkdir -p ${package_dir}/final_dist python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/check_style.sh b/ci/check_style.sh index e96ad8bf1db..a01cf4dcc6b 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 920625b452f..d36b609799b 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -8,16 +8,21 @@ PANDAS_TESTS_BRANCH=${1} rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" rapids-logger "PR number: $RAPIDS_REF_NAME" - -COMMIT=$(git rev-parse HEAD) -WHEEL_NAME="cudf" -if [[ "${PANDAS_TESTS_BRANCH}" == "main" ]]; then - COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - WHEEL_NAME="${WHEEL_NAME}_${PANDAS_TESTS_BRANCH}" +# Set the manylinux version 
used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" fi +manylinux="manylinux_${manylinux_version}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas_tests] git checkout $COMMIT diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index cc578b50fd0..7eab3221e5e 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -31,8 +31,21 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else + # Set the manylinux version used for downloading the wheels so that we test the + # newer ABI wheels on the newer images that support their installation. + # Need to disable pipefail for the head not to fail, see + # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q + set +o pipefail + glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) + set -o pipefail + manylinux_version="2_17" + if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" + fi + manylinux="manylinux_${manylinux_version}" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep + RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf_pandas_tests] fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eac64fe1a0f..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python CMakeLists updates sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt @@ -50,17 +51,8 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # cpp cudf_jni update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt -# Python __init__.py updates -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf/cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/dask_cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/cudf_kafka/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" 
python/custreamz/custreamz/__init__.py - -# Python pyproject.toml updates -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml +# Centralized version file update +echo "${NEXT_FULL_TAG}" > VERSION # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh @@ -89,6 +81,7 @@ DEPENDENCIES=( kvikio libkvikio librmm + rapids-dask-dependency rmm ) for DEP in "${DEPENDENCIES[@]}"; do @@ -108,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83e24ab3ff1..8c42651e299 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,8 +3,21 @@ set -eou pipefail +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" +fi +manylinux="manylinux_${manylinux_version}" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index a0a6fbede13..e9162b816aa 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -7,13 +7,24 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # Download the cudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" +fi +manylinux="manylinux_${manylinux_version}" -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] +# Run tests in dask_cudf/tests and dask_cudf/io/tests python -m pytest -n 8 ./python/dask_cudf/dask_cudf/ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 27a3a84e3f1..9b85888a7b3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,11 +24,8 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- cython>=3.0.3 +- dask-cuda==23.12.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -40,14 +37,14 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow-all==14.0.1.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -60,7 +57,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 @@ -69,9 +66,8 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* -- pydata-sphinx-theme -- pyorc +- pyarrow==14.0.1.* +- pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark - pytest-cases @@ -81,8 +77,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index eb229f15af2..a3eeb3dd99f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -25,11 +25,8 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- cython>=3.0.3 +- dask-cuda==23.12.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,12 +38,12 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow-all==14.0.1.* - libcufile-dev - libcurand-dev -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -58,7 +55,7 @@ dependencies: - numba>=0.57,<0.58 - numpy>=1.21,<1.25 - numpydoc -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 @@ -66,9 +63,8 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* -- pydata-sphinx-theme -- pyorc +- pyarrow==14.0.1.* +- pydata-sphinx-theme!=0.14.2 - pytest - 
pytest-benchmark - pytest-cases @@ -78,8 +74,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index d3e15f70ccb..27edde1c98a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -55,13 +55,13 @@ requirements: - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf ==4.21.* + - protobuf ==4.24.* - python - - cython >=3.0.0 + - cython >=3.0.3 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow =12 + - pyarrow ==14.0.1.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -82,7 +82,7 @@ requirements: - numba >=0.57,<0.58 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - numpy >=1.21,<1.25 - - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} + - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index f4bb6e1bc91..9458349d101 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,16 +1,3 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# This assumes the script is executed from the root of the repo directory -# Need to set CUDA_HOME inside conda environments because the hacked together -# setup.py for cudf-kafka searches that way. -# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates -# cudf_kafka to use scikit-build -CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} -if [[ ${CUDA_MAJOR} == "12" ]]; then - target_name="x86_64-linux" - if [[ ! $(arch) == "x86_64" ]]; then - target_name="sbsa-linux" - fi - export CUDA_HOME="${PREFIX}/targets/${target_name}/" -fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index b63a136ad2d..c98c2701653 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -9,3 +9,9 @@ sysroot_version: cmake_version: - ">=3.26.4" + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index a79c23b7d98..343ec2519f1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -33,28 +33,31 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - RAPIDS_CUDA_VERSION + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - {% if cuda_major == "12" %} - - cuda-gdb + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} {% endif %} + - cuda-version ={{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - - cython >=3.0.0 + - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - scikit-build >=0.13.1 - setuptools {% if cuda_major == "12" %} - cuda-cudart-dev diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 233d51baf31..755394e3936 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -45,9 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 4c8af071074..16638926492 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. 
build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -37,17 +37,11 @@ build: requirements: host: - python - - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh deleted file mode 100644 index c79c014a89a..00000000000 --- a/conda/recipes/dask-cudf/run_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -set -e - -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - -# Importing cudf on arm64 CPU only nodes is currently not working due to a -# difference in reported gpu devices between arm64 and amd64 -ARCH=$(arch) - -if [ "${ARCH}" = "aarch64" ]; then - logger "Skipping tests on arm64" - exit 0 -fi - -# Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 - -# Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.9.2" - -# Install the conda-forge or nightly version of dask and distributed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" - rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" -else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall -fi - -logger "python -c 'import dask_cudf'" -python -c "import dask_cudf" diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 25b3f19de77..fa06ed048b7 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -23,7 +23,7 @@ gtest_version: - ">=1.13.0" libarrow_version: - - "=12" + - "==14.0.1" dlpack_version: - ">=0.5,<0.6.0a0" @@ -38,7 +38,7 @@ spdlog_version: - ">=1.11.0,<1.12" nvcomp_version: - - "=2.6.1" + - "=3.0.4" zlib_version: - ">=1.2.13" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..0459908fd00 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -11,7 +11,7 @@ package: name: libcudf-split source: - git_url: ../../.. + path: ../../.. 
build: script_env: @@ -91,6 +91,8 @@ outputs: requirements: build: - cmake {{ cmake_version }} + host: + - libarrow {{ libarrow_version }} run: {% if cuda_major == "11" %} - cudatoolkit @@ -103,7 +105,6 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec58c391001..bd9c936626a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu @@ -439,6 +440,7 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu + src/json/json_path.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu @@ -570,7 +572,6 @@ add_library( src/strings/filter_chars.cu src/strings/like.cu src/strings/padding.cu - src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cpp src/strings/regex/regex_program.cpp @@ -581,6 +582,7 @@ add_library( src/strings/replace/replace.cu src/strings/replace/replace_re.cu src/strings/reverse.cu + src/strings/scan/scan_inclusive.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -597,6 +599,7 @@ add_library( src/strings/utilities.cu src/strings/wrap.cu src/structs/copying/concatenate.cu + src/structs/scan/scan_inclusive.cu src/structs/structs_column_factories.cu src/structs/structs_column_view.cpp src/structs/utilities.cpp @@ -613,10 +616,10 @@ add_library( src/text/normalize.cu src/text/replace.cu src/text/stemmer.cu - src/text/subword/bpe_tokenizer.cu + src/text/bpe/byte_pair_encoding.cu + src/text/bpe/load_merge_pairs.cu src/text/subword/data_normalizer.cu src/text/subword/load_hash_file.cu - src/text/subword/load_merges_file.cu src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu @@ -834,6 +837,7 @@ if(CUDF_BUILD_TESTUTIL) tests/io/metadata_utilities.cpp tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu tests/utilities/table_utilities.cu tests/utilities/tdigest_utilities.cu ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cd6b3cfdc03..9c3a05a2f5f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) +ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- @@ -277,7 +278,7 @@ 
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## @@ -319,7 +320,7 @@ ConfigureNVBench( # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH string/json.cu) +ConfigureBench(JSON_BENCH json/json.cu) ConfigureNVBench(FST_NVBENCH io/fst.cu) ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/common/generate_nested_types.hpp similarity index 98% rename from cpp/benchmarks/sort/nested_types_common.hpp rename to cpp/benchmarks/common/generate_nested_types.hpp index 93853ba5768..ee9e3ca9de3 100644 --- a/cpp/benchmarks/sort/nested_types_common.hpp +++ b/cpp/benchmarks/common/generate_nested_types.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include "generate_input.hpp" #include diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index 460100a8fe9..e1169e3bcd6 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -56,18 +56,32 @@ static void BM_shift(benchmark::State& state) cudf::size_type size = state.range(0); cudf::size_type offset = size * (static_cast(shift_factor) / 100.0); - auto const input_table = - create_sequence_table({cudf::type_to_id()}, - row_count{size}, - use_validity ? std::optional{1.0} : std::nullopt); + auto constexpr column_type_id = cudf::type_id::INT32; + using column_type = cudf::id_to_type; + + auto const input_table = create_sequence_table( + {column_type_id}, row_count{size}, use_validity ? std::optional{1.0} : std::nullopt); cudf::column_view input{input_table->get_column(0)}; - auto fill = use_validity ? make_scalar() : make_scalar(777); + auto fill = use_validity ? make_scalar() : make_scalar(777); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::shift(input, offset, *fill); } + + auto const elems_read = (size - offset); + auto const bytes_read = elems_read * sizeof(column_type); + + // If 'use_validity' is false, the fill value is a number, and the entire column + // (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity' + // is true, only the elements that can be shifted are written, along with the full null bitmask. + auto const elems_written = use_validity ? (size - offset) : size; + auto const bytes_written = elems_written * sizeof(column_type); + auto const null_bytes = use_validity ? 
2 * cudf::bitmask_allocation_size_bytes(size) : 0; + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_written + bytes_read + null_bytes)); } class Shift : public cudf::benchmark {}; diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/json/json.cu similarity index 98% rename from cpp/benchmarks/string/json.cu rename to cpp/benchmarks/json/json.cu index 7e89edf3e17..5dc30aebe38 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -21,9 +21,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -196,7 +196,7 @@ void BM_case(benchmark::State& state, std::string query_arg) for (auto _ : state) { cuda_event_timer raii(state, true); - auto result = cudf::strings::get_json_object(scv, json_path); + auto result = cudf::get_json_object(scv, json_path); CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp index 5b240923358..6bed33d2570 100644 --- a/cpp/benchmarks/lists/set_operations.cpp +++ b/cpp/benchmarks/lists/set_operations.cpp @@ -54,6 +54,7 @@ void nvbench_set_op(nvbench::state& state, BenchFuncPtr bfunc) cudf::lists_column_view{*rhs}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); }); } diff --git a/cpp/benchmarks/merge/merge_lists.cpp b/cpp/benchmarks/merge/merge_lists.cpp new file mode 100644 index 00000000000..bcb9f10ac83 --- /dev/null +++ b/cpp/benchmarks/merge/merge_lists.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +void nvbench_merge_list(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_lists_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_lists_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_list) + .set_name("merge_lists") + .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28}) + .add_int64_axis("depth", {1, 4}) + .add_float64_axis("null_frequency", {0, 0.2}); diff --git a/cpp/benchmarks/merge/merge_structs.cpp b/cpp/benchmarks/merge/merge_structs.cpp new file mode 100644 index 00000000000..9c56b44b623 --- /dev/null +++ b/cpp/benchmarks/merge/merge_structs.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
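The json benchmark hunk above tracks get_json_object moving out of the cudf::strings namespace (src/strings/json/json_path.cu now lives at src/json/json_path.cu). A minimal sketch of the relocated call; the cudf/json/json.hpp header path and the JSONPath string are assumptions for illustration, not taken from this diff:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/json/json.hpp>  // assumed new home of the get_json_object declaration
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Extract a field from a strings column of JSON documents.
std::unique_ptr<cudf::column> extract_key(cudf::strings_column_view const& scv)
{
  auto const json_path = cudf::string_scalar("$.features.key");  // illustrative path
  // Previously spelled cudf::strings::get_json_object(scv, json_path).
  return cudf::get_json_object(scv, json_path);
}
```

As the benchmark hunk shows, the two-argument call shape is unchanged; existing call sites only need the namespace (and include) update.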
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +void nvbench_merge_struct(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_structs_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_structs_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_struct) + .set_name("merge_struct") + .add_int64_power_of_two_axis("NumRows", {10, 18, 26}) + .add_int64_axis("Depth", {0, 1, 8}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp index 49dc409ebfc..c23f3c891f0 100644 --- a/cpp/benchmarks/sort/rank_lists.cpp +++ b/cpp/benchmarks/sort/rank_lists.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include + #include #include diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp index 85427e2128f..271b883e62a 100644 --- a/cpp/benchmarks/sort/rank_structs.cpp +++ b/cpp/benchmarks/sort/rank_structs.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp index 4b04323a99f..2052de3688c 100644 --- a/cpp/benchmarks/sort/sort_lists.cpp +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp index 1d54fa42f6f..3a3d1080ba0 100644 --- a/cpp/benchmarks/sort/sort_structs.cpp +++ b/cpp/benchmarks/sort/sort_structs.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/stream_compaction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp index 2b2c901b90f..3e324013d4e 100644 --- a/cpp/benchmarks/stream_compaction/distinct_count.cpp +++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp @@ -40,6 +40,14 @@ static void bench_distinct_count(nvbench::state& state, nvbench::type_list auto const& data_column = data_table->get_column(0); auto const input_table = cudf::table_view{{data_column, data_column, data_column}}; + // Collect memory statistics for input and output. 
+ state.add_global_memory_reads(input_table.num_rows() * input_table.num_columns()); + state.add_global_memory_writes(1); + if (null_probability > 0) { + state.add_global_memory_reads( + input_table.num_columns() * cudf::bitmask_allocation_size_bytes(input_table.num_rows())); + } + auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp new file mode 100644 index 00000000000..80942e2697d --- /dev/null +++ b/cpp/benchmarks/text/vocab.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +static void bench_vocab_tokenize(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto const column = [num_rows, row_width] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(" "), + cudf::strings::string_character_types::ALPHANUM); + }(); + cudf::strings_column_view input(column->view()); + + auto const vocab_col = [] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, 15); + auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(""), + cudf::strings::string_character_types::ALPHANUM); + }(); + auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view())); + + auto token_count = [input] { + auto const counts = nvtext::count_tokens(input); + auto const agg = cudf::make_sum_aggregation(); + auto const count = cudf::reduce(counts->view(), *agg, counts->type()); + return static_cast*>(count.get()) + ->value(cudf::get_default_stream()); + }(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(token_count); + + auto const 
delimiter = cudf::string_scalar(""); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter); + }); +} + +NVBENCH_BENCH(bench_vocab_tokenize) + .set_name("vocab_tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); diff --git a/cpp/benchmarks/transpose/transpose.cpp b/cpp/benchmarks/transpose/transpose.cpp index 2f41bda4b88..c2737325462 100644 --- a/cpp/benchmarks/transpose/transpose.cpp +++ b/cpp/benchmarks/transpose/transpose.cpp @@ -20,17 +20,19 @@ #include #include #include +#include #include #include static void BM_transpose(benchmark::State& state) { - auto count = state.range(0); + auto count = state.range(0); + constexpr auto column_type_id = cudf::type_id::INT32; auto int_column_generator = thrust::make_transform_iterator(thrust::counting_iterator(0), [count](int i) { return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID); + cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID); }); auto input_table = cudf::table(std::vector(int_column_generator, int_column_generator + count)); @@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state) cuda_event_timer raii(state, true); auto output = cudf::transpose(input); } + + // Collect memory statistics. + auto const bytes_read = static_cast(input.num_columns()) * input.num_rows() * + sizeof(cudf::id_to_type); + auto const bytes_written = bytes_read; + // Account for nullability in input and output. + auto const null_bytes = 2 * static_cast(input.num_columns()) * + cudf::bitmask_allocation_size_bytes(input.num_rows()); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } class Transpose : public cudf::benchmark {}; diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 894dc9649e2..05aa5730b4d 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -53,19 +53,35 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) - # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's - # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note - # that these flags will often be redundant because we build wheels in manylinux containers that - # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent - # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer - # containers. Note that tests will not build successfully without also propagating these options - # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly - # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using - # this feature except for building wheels. 
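Several benchmark hunks in this change (copying/shift, stream_compaction/distinct_count, text/vocab, transpose) add explicit memory-traffic accounting so the harness can report achieved bandwidth rather than just elapsed time. A generic, hedged sketch of the nvbench side of that pattern; the reduction being timed is illustrative only:

```cpp
#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

static void bench_reduce(nvbench::state& state)
{
  auto const num_elements = static_cast<std::size_t>(state.get_int64("num_elements"));
  thrust::device_vector<int> data(num_elements, 1);

  // Declare the expected global memory traffic; nvbench divides it by the
  // measured time to report throughput alongside the timing columns.
  state.add_global_memory_reads<int>(num_elements);
  state.add_global_memory_writes<int>(1);

  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { thrust::reduce(thrust::device, data.begin(), data.end()); });
}

NVBENCH_BENCH(bench_reduce).set_name("reduce").add_int64_axis("num_elements", {1 << 24});
```

The google-benchmark based files (shift, transpose) achieve the same effect through state.SetBytesProcessed, computing bytes read and written by hand as shown in those hunks.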
- target_compile_options( - Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" - "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + # When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11 + # ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we + # need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow. + # We determine what options to use by checking the glibc version on the current system, which is + # also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will + # not build successfully without also propagating these options to builds of GTest. Similarly, + # benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently + # ignoring these limitations since we don't anticipate using this feature except for building + # wheels. + EXECUTE_PROCESS( + COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6 + OUTPUT_VARIABLE GLIBC_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + EXECUTE_PROCESS( + COMMAND ${GLIBC_EXECUTABLE} + OUTPUT_VARIABLE GLIBC_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE ) + STRING(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT}) + STRING(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION}) + STRING(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION}) + LIST(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR) + if(GLIBC_VERSION_MINOR LESS 28) + target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + endif() rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) @@ -387,11 +403,19 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - if(ENABLE_PARQUET) - rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - endif() + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" @@ -403,7 +427,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. 
- 12.0.1 + 14.0.1 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake index c0235eba508..bfdff3a99ff 100644 --- a/cpp/cmake/thirdparty/get_cufile.cmake +++ b/cpp/cmake/thirdparty/get_cufile.cmake @@ -21,10 +21,10 @@ function(find_and_configure_cufile) if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") rapids_export_find_package_file( - BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports + BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports ) rapids_export_find_package_file( - INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports + INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 1363f43fae2..cfb219448f1 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -30,7 +30,7 @@ function(find_and_configure_gtest) include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports + BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-testing-exports ) endif() diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index e94e024d6c9..20712beec41 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -25,10 +25,12 @@ function(find_and_configure_kvikio VERSION) OPTIONS "KvikIO_BUILD_EXAMPLES OFF" ) - if(KvikIO_BINARY_DIR) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD KvikIO "${KvikIO_BINARY_DIR}" cudf-exports) - endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD KvikIO "${KvikIO_BINARY_DIR}" + EXPORT_SET cudf-exports + CONDITION KvikIO_BINARY_DIR + ) endfunction() diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 0e03352c335..285d66287f3 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -22,16 +22,14 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) - if(libcudacxx_SOURCE_DIR) - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL - libcudacxx - [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] - cudf-exports - ) - endif() + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL libcudacxx + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] + EXPORT_SET cudf-exports + CONDITION libcudacxx_SOURCE_DIR + ) endfunction() find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index f0642145fa0..bbd22693ba4 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvbench) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench() + rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index fff5b84af0d..c0e07d02d94 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -27,7 +27,9 @@ function(find_and_configure_spdlog) NAMESPACE spdlog:: ) include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root( + BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 39a9de15fa6..67ed4287d7b 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -33,14 +33,13 @@ function(find_and_configure_thrust) INSTALL_EXPORT_SET cudf-exports ) - if(Thrust_SOURCE_DIR) - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL Thrust - [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] cudf-exports - ) - endif() + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL Thrust 
[=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] + EXPORT_SET cudf-exports + CONDITION Thrust_SOURCE_DIR + ) endfunction() find_and_configure_thrust() diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..f85bdb9486c 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -9,8 +9,8 @@ "fixed_in" : "" }, { - "file" : "nvbench/use_existing_fmt.diff", - "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", + "file" : "nvbench/nvml_with_static_builds.diff", + "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]", "fixed_in" : "" } ] diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index b072d252881..adefaaa1479 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = libcudf # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.10.00 +PROJECT_NUMBER = 23.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2226,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 1c1952c4616..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.10) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..001cdeec694 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
# libcudf examples build script @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..cb9430db237 --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..5969985cc72 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool pool) +{ + auto cuda_mr = std::make_shared(); + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + return cuda_mr; +} + +/** + * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + return cudf::io::read_json(options); +} + +/** + * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); +} + +/** + * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr + */ +std::unique_ptr count_aggregate(cudf::table_view input) +{ + // Get count for each key + auto keys = cudf::table_view{{input.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + auto left_cols = result_key->release(); + 
left_cols.push_back(std::move(result_val)); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr + */ +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) +{ + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Sort nested-type column + * + * @param input table to sort + * @return std::unique_ptr + * + * @note if stability is desired, use `cudf::stable_sorted_order` + */ +std::unique_ptr sort_keys(cudf::table_view input) +{ + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. JSON output file name/path (default: "output.json") + * 3. Memory resource (optional): "pool" or "cuda" (default: "pool") + * + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + std::string mr_name; + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { + input_filepath = "example.json"; + output_filepath = "output.json"; + mr_name = "pool"; + } else { + input_filepath = argv[1]; + output_filepath = argv[2]; + mr_name = argv[3]; + } + + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); + rmm::mr::set_current_device_resource(resource.get()); + + std::cout << "Reading " << input_filepath << "..." << std::endl; + // read input file + auto [input, metadata] = read_json(input_filepath); + + auto count = count_aggregate(input->view()); + + auto combined = join_count(input->view(), count->view()); + + auto sorted = sort_keys(combined->view()); + + metadata.schema_info.emplace_back("count"); + + std::cout << "Writing " << output_filepath << "..." 
<< std::endl; + write_json(sorted->view(), metadata, output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..efaa37817d6 --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 31a6b12a4bc..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.10) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index db0abe435b0..a36a831a7aa 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -67,8 +67,8 @@ struct alignas(8) device_data_reference { bool operator==(device_data_reference const& rhs) const { - return std::tie(data_index, reference_type, table_source) == - std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); + return std::tie(data_index, data_type, reference_type, table_source) == + std::tie(rhs.data_index, rhs.data_type, rhs.reference_type, rhs.table_source); } }; diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The 
desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 +147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35851a99822..b1ff0bbaea7 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); char const* d_strings = d_children[strings_column_view::chars_column_index].data(); - size_type offset = d_offsets[index]; - return string_view{d_strings + offset, d_offsets[index + 1] - offset}; + auto const offsets = d_children[strings_column_view::offsets_column_index]; + auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = itr[index]; + return string_view{d_strings + offset, static_cast(itr[index + 1] - offset)}; } private: diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 955f9914632..c9975ef2199 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -673,14 +673,20 @@ std::unique_ptr gather(table_view const& source_table, mr)); } - auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || 
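The binaryop.hpp hunks above add an explicit stream parameter (defaulting to cudf::get_default_stream()) to each binary_operation overload. A minimal usage sketch; the operand columns, output type, and caller-provided stream are illustrative assumptions:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Add two INT32 columns on a caller-provided stream instead of the default stream.
std::unique_ptr<cudf::column> add_on_stream(cudf::column_view const& lhs,
                                            cudf::column_view const& rhs,
                                            rmm::cuda_stream_view stream)
{
  return cudf::binary_operation(lhs,
                                rhs,
                                cudf::binary_operator::ADD,
                                cudf::data_type{cudf::type_id::INT32},
                                stream,  // parameter added in this change
                                rmm::mr::get_current_device_resource());
}
```

Callers that omit both trailing optional arguments are unaffected, since the stream defaults to cudf::get_default_stream().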
- std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); - }); - if (nullable) { - auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY - : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + auto needs_new_bitmask = bounds_policy == out_of_bounds_policy::NULLIFY || + cudf::has_nested_nullable_columns(source_table); + if (needs_new_bitmask) { + needs_new_bitmask = needs_new_bitmask || cudf::has_nested_nulls(source_table); + if (needs_new_bitmask) { + auto const op = bounds_policy == out_of_bounds_policy::NULLIFY + ? gather_bitmask_op::NULLIFY + : gather_bitmask_op::DONT_CHECK; + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + } else { + for (size_type i = 0; i < source_table.num_columns(); ++i) { + set_all_valid_null_masks(source_table.column(i), *destination_columns[i], stream, mr); + } + } } return std::make_unique
(std::move(destination_columns)); diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 6532dae3695..4d261c54b29 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -56,10 +56,69 @@ namespace detail { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -using input_indexalator = input_normalator; +struct input_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = cudf::size_type const; // this keeps STL and thrust happy + + input_indexalator() = default; + input_indexalator(input_indexalator const&) = default; + input_indexalator(input_indexalator&&) = default; + input_indexalator& operator=(input_indexalator const&) = default; + input_indexalator& operator=(input_indexalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline cudf::size_type operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template ())> + __device__ cudf::size_type operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template ())> + __device__ cudf::size_type operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline cudf::size_type operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + * @param offset Applied to the data pointer per size of the type + */ + CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0) + : base_normalator(dtype), p_{static_cast(data)} + { + p_ += offset * this->width_; + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; /** - * @brief The index normalizing output iterator. + * @brief The index normalizing output iterator * * This is an iterator that can be used for index types (integers) without * requiring a type-specific instance. It can be used for any iterator @@ -82,7 +141,75 @@ using input_indexalator = input_normalator; * thrust::less()); * @endcode */ -using output_indexalator = output_normalator; +struct output_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_indexalator const&; // required for output iterators + + output_indexalator() = default; + output_indexalator(output_indexalator const&) = default; + output_indexalator(output_indexalator&&) = default; + output_indexalator& operator=(output_indexalator const&) = default; + output_indexalator& operator=(output_indexalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline reference operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. 
+ * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_indexalator const operator[](size_type idx) const + { + output_indexalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template ())> + __device__ void operator()(void* tp, cudf::size_type const value) + { + (*static_cast(tp)) = static_cast(value); + } + template ())> + __device__ void operator()(void*, cudf::size_type const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline reference operator=(cudf::size_type const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype) + : base_normalator(dtype), p_{static_cast(data)} + { + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; /** * @brief Use this class to create an indexalator instance. @@ -92,14 +219,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an input iterator from an indices column. */ struct input_indexalator_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(column_view const& indices) { return input_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); @@ -110,16 +235,14 @@ struct indexalator_factory { * @brief Use this class to create an indexalator to a scalar index. */ struct input_indexalator_scalar_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(scalar const& index) { // note: using static_cast const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); return input_indexalator(scalar_impl->data(), index.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("scalar must be an index type"); @@ -130,14 +253,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an output iterator from an indices column. */ struct output_indexalator_fn { - template ()>* = nullptr> + template ())> output_indexalator operator()(mutable_column_view const& indices) { return output_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> output_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 44024333239..8124471982d 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -194,5 +194,18 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Return a maximum precision for a given type. + * + * @tparam T the type to get the maximum precision for + */ +template +constexpr std::size_t max_precision() +{ + auto constexpr num_bits = sizeof(T) * 8; + return std::floor(num_bits * std::log(2) / std::log(10)); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh deleted file mode 100644 index e8e9b080a92..00000000000 --- a/cpp/include/cudf/detail/merge.cuh +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -namespace cudf { -namespace detail { -/** - * @brief Source table identifier to copy data from. - */ -enum class side : bool { LEFT, RIGHT }; - -/** - * @brief Tagged index type: `thrust::get<0>` indicates left/right side, - * `thrust::get<1>` indicates the row index - */ -using index_type = thrust::pair; - -/** - * @brief Vector of `index_type` values. - */ -using index_vector = rmm::device_uvector; - -/** - * @brief tagged_element_relational_comparator uses element_relational_comparator to provide - * "tagged-index" comparison logic. - * - * Special treatment is necessary in several thrust algorithms (e.g., merge()) where - * the index affinity to the side is not guaranteed; i.e., the algorithms rely on - * binary functors (predicates) where the operands may transparently switch sides. - * - * For example, - * thrust::merge(left_container, - * right_container, - * predicate(lhs, rhs){...}); - * can create 4 different use-cases, inside predicate(...): - * - * 1. lhs refers to the left container; rhs to the right container; - * 2. vice-versa; - * 3. both lhs and rhs actually refer to the left container; - * 4. both lhs and rhs actually refer to the right container; - * - * Because of that, one cannot rely on the predicate having *fixed* references to the containers. - * Each invocation may land in a different situation (among the 4 above) than any other invocation. - * Also, one cannot just manipulate lhs, rhs (indices) alone; because, if predicate always applies - * one index to one container and the other index to the other container, - * switching the indices alone won't suffice in the cases (3) or (4), - * where the also the containers must be changed (to just one instead of two) - * independently of indices; - * - * As a result, a special comparison logic is necessary whereby the index is "tagged" with side - * information and consequently comparator functors (predicates) must operate on these tagged - * indices rather than on raw indices. 
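As a stand-alone illustration of the tagged-index idea the comment above describes (this sketch is not part of the diff; std::pair stands in for thrust::pair and the row type for cudf::size_type):

#include <cstdio>
#include <utility>

// A "tagged" merge index couples the source side with a row offset so that a
// comparison predicate can resolve which table each operand refers to,
// regardless of how thrust::merge pairs up the operands.
enum class side : bool { LEFT, RIGHT };
using index_type = std::pair<side, int>;

int main()
{
  index_type left_row{side::LEFT, 3};    // row 3 of the left table
  index_type right_row{side::RIGHT, 7};  // row 7 of the right table
  auto const row_of = [](index_type i) { return i.second; };
  std::printf("compare left row %d with right row %d\n", row_of(left_row), row_of(right_row));
  return 0;
}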
- */ -template -struct tagged_element_relational_comparator { - __host__ __device__ tagged_element_relational_comparator(column_device_view lhs, - column_device_view rhs, - null_order null_precedence) - : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence} - { - } - - [[nodiscard]] __device__ weak_ordering compare(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - auto const [l_side, l_indx] = lhs_tagged_index; - auto const [r_side, r_indx] = rhs_tagged_index; - - column_device_view const* ptr_left_dview{l_side == side::LEFT ? &lhs : &rhs}; - column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs}; - - auto erl_comparator = element_relational_comparator( - nullate::DYNAMIC{has_nulls}, *ptr_left_dview, *ptr_right_dview, null_precedence); - - return cudf::type_dispatcher(lhs.type(), erl_comparator, l_indx, r_indx); - } - - private: - column_device_view lhs; - column_device_view rhs; - null_order null_precedence; -}; - -/** - * @brief The equivalent of `row_lexicographic_comparator` for tagged indices. - */ -template -struct row_lexicographic_tagged_comparator { - row_lexicographic_tagged_comparator(table_device_view lhs, - table_device_view rhs, - order const* column_order = nullptr, - null_order const* null_precedence = nullptr) - : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} - { - // Add check for types to be the same. - CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); - } - - __device__ bool operator()(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - for (size_type i = 0; i < _lhs.num_columns(); ++i) { - bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING); - - null_order null_precedence = - _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i]; - - auto comparator = tagged_element_relational_comparator{ - _lhs.column(i), _rhs.column(i), null_precedence}; - - weak_ordering state = comparator.compare(lhs_tagged_index, rhs_tagged_index); - - if (state == weak_ordering::EQUIVALENT) { continue; } - - return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER); - } - return false; - } - - private: - table_device_view _lhs; - table_device_view _rhs; - null_order const* _null_precedence{}; - order const* _column_order{}; -}; - -/** - * @copydoc std::unique_ptr merge( - * std::vector const& tables_to_merge, - * std::vector const& key_cols, - * std::vector const& column_order, - * std::vector const& null_precedence, - * rmm::mr::device_memory_resource* mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr merge(std::vector const& tables_to_merge, - std::vector const& key_cols, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp new file mode 100644 index 00000000000..2167a484214 --- /dev/null +++ b/cpp/include/cudf/detail/merge.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Source table identifier to copy data from. + */ +enum class side : bool { LEFT, RIGHT }; + +/** + * @brief Tagged index type: `thrust::get<0>` indicates left/right side, + * `thrust::get<1>` indicates the row index + */ +using index_type = thrust::pair; + +/** + * @brief Vector of `index_type` values. + */ +using index_vector = rmm::device_uvector; + +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 51b3133f84f..8f90afc3e57 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -33,8 +33,8 @@ namespace detail { * @tparam Integer The type the iterator normalizes to */ template -struct base_normalator { - static_assert(std::is_integral_v); +struct alignas(16) base_normalator { + static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; using pointer = Integer*; @@ -202,165 +202,43 @@ struct base_normalator { return static_cast(*this).p_ >= rhs.p_; } - protected: - /** - * @brief Constructor assigns width and type member variables for base class. - */ - explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} - - int width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - -/** - * @brief The integer normalizing input iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for reading an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. 
- * Reading specific elements always return a type of `Integer` - * - * @tparam Integer Type returned by all read functions - */ -template -struct input_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = Integer const; // this keeps STL and thrust happy - - input_normalator() = default; - input_normalator(input_normalator const&) = default; - input_normalator(input_normalator&&) = default; - input_normalator& operator=(input_normalator const&) = default; - input_normalator& operator=(input_normalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position - */ - __device__ inline Integer operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a Integer value from any integer type - */ - struct normalize_type { - template >* = nullptr> - __device__ Integer operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template >* = nullptr> - __device__ Integer operator()(void const*) + private: + struct integer_sizeof_fn { + template ())> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("only integral types are supported"); +#else CUDF_UNREACHABLE("only integral types are supported"); +#endif } - }; - - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `Integer` value. - */ - __device__ inline Integer operator[](size_type idx) const - { - void const* tp = p_ + (idx * this->width_); - return type_dispatcher(this->dtype_, normalize_type{}, tp); - } - - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data - */ - input_normalator(void const* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} - { - } - - char const* p_; /// pointer to the integer data in device memory -}; - -/** - * @brief The integer normalizing output iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for writing an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Setting specific elements always accept the `Integer` type values. - * - * @tparam Integer The type used for all write functions - */ -template -struct output_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = output_normalator const&; // required for output iterators - - output_normalator() = default; - output_normalator(output_normalator const&) = default; - output_normalator(output_normalator&&) = default; - output_normalator& operator=(output_normalator const&) = default; - output_normalator& operator=(output_normalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(Integer)` calls. - */ - __device__ inline output_normalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(Integer)` call in this class. 
- */ - __device__ inline output_normalator const operator[](size_type idx) const - { - output_normalator tmp{*this}; - tmp.p_ += (idx * this->width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct normalize_type { - template >* = nullptr> - __device__ void operator()(void* tp, Integer const value) - { - (*static_cast(tp)) = static_cast(value); - } - template >* = nullptr> - __device__ void operator()(void*, Integer const) + template ())> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept { - CUDF_UNREACHABLE("only index types are supported"); + return sizeof(T); } }; + protected: /** - * @brief Assign an Integer value to the current iterator position + * @brief Constructor assigns width and type member variables for base class. */ - __device__ inline output_normalator const& operator=(Integer const value) const + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype) : dtype_(dtype) { - void* tp = p_; - type_dispatcher(this->dtype_, normalize_type{}, tp, value); - return *this; + width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); } /** - * @brief Create an output normalizing iterator - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data + * @brief Constructor assigns width and type member variables for base class. */ - output_normalator(void* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width) + : width_(width), dtype_(dtype) { } - char* p_; /// pointer to the integer data in device memory + int32_t width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls }; } // namespace detail diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 8c10bbe416f..74e2ccd2ea1 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -259,6 +260,22 @@ cudf::size_type inplace_bitmask_and(device_span dest_mask, size_type mask_size_bits, rmm::cuda_stream_view stream); +/** + * @brief Recursively set valid null masks for all children. + * + * This function applies all valid null masks to the output column if input column satisfies + * `nullable() == true` condition + * + * @param input input column to check for nullability + * @param output output column to mirror nullability of input + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh new file mode 100644 index 00000000000..3eb77b32353 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief The offsets normalizing input iterator + * + * This is an iterator that can be used for offsets where the underlying + * type may be int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate input iterator + * from an offsets column_view. + */ +struct input_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = int64_t const; // this keeps STL and thrust happy + + input_offsetalator() = default; + input_offsetalator(input_offsetalator const&) = default; + input_offsetalator(input_offsetalator&&) = default; + input_offsetalator& operator=(input_offsetalator const&) = default; + input_offsetalator& operator=(input_offsetalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline int64_t operator*() const { return operator[](0); } + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a int64_t value. + */ + __device__ inline int64_t operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return this->width_ == sizeof(int32_t) ? static_cast(*static_cast(tp)) + : *static_cast(tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The offsets normalizing output iterator + * + * This is an iterator that can be used for storing offsets values + * where the underlying type may be either int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate output iterator + * from a mutable_column_view. + * + */ +struct output_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_offsetalator const&; // required for output iterators + + output_offsetalator() = default; + output_offsetalator(output_offsetalator const&) = default; + output_offsetalator(output_offsetalator&&) = default; + output_offsetalator& operator=(output_offsetalator const&) = default; + output_offsetalator& operator=(output_offsetalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(int64)` calls. 
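The width-based dispatch both offsetalator structs rely on can be seen in isolation in the following stand-alone sketch (plain host C++, not the cudf code): the element width chosen at construction decides whether each position is read as int32_t or int64_t.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Core trick of the offsetalator: walk a type-erased buffer with a char
// pointer plus a runtime element width, then reinterpret each element as
// either int32_t or int64_t.
int64_t read_offset(void const* data, std::size_t width, std::ptrdiff_t idx)
{
  auto const* p = static_cast<char const*>(data) + idx * static_cast<std::ptrdiff_t>(width);
  return width == sizeof(int32_t) ? static_cast<int64_t>(*reinterpret_cast<int32_t const*>(p))
                                  : *reinterpret_cast<int64_t const*>(p);
}

int main()
{
  int32_t narrow[] = {0, 5, 9};
  int64_t wide[]   = {0, 5, 9};
  std::printf("%lld %lld\n",
              static_cast<long long>(read_offset(narrow, sizeof(int32_t), 2)),
              static_cast<long long>(read_offset(wide, sizeof(int64_t), 2)));
  return 0;
}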
+ */ + __device__ inline output_offsetalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(int64)` call in this class. + */ + __device__ inline output_offsetalator const operator[](size_type idx) const + { + output_offsetalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Assign an offset value to the current iterator position + */ + __device__ inline output_offsetalator const& operator=(int64_t const value) const + { + void* tp = p_; + if (this->width_ == sizeof(int32_t)) { + (*static_cast(tp)) = static_cast(value); + } else { + (*static_cast(tp)) = value; + } + return *this; + } + + /** + * @brief Create an output offsets iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh new file mode 100644 index 00000000000..5b4c6b825d2 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Use this class to create an offsetalator instance. + */ +struct offsetalator_factory { + /** + * @brief Create an input offsetalator instance from an offsets column + */ + static input_offsetalator make_input_iterator(column_view const& offsets) + { + return input_offsetalator(offsets.head(), offsets.type()); + } + + /** + * @brief Create an output offsetalator instance from an offsets column + */ + static output_offsetalator make_output_iterator(mutable_column_view const& offsets) + { + return output_offsetalator(offsets.head(), offsets.type()); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 3fbdf4a5a8f..12f864de572 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -64,8 +64,6 @@ std::unique_ptr true_if(InputIterator begin, /** * @copydoc cudf::unary_operation - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, @@ -74,8 +72,6 @@ std::unique_ptr unary_operation(cudf::column_view const& input, /** * @copydoc cudf::is_valid - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_valid(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -83,8 +79,6 @@ std::unique_ptr is_valid(cudf::column_view const& input, /** * @copydoc cudf::cast - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr cast(column_view const& input, data_type type, @@ -93,8 +87,6 @@ std::unique_ptr cast(column_view const& input, /** * @copydoc cudf::is_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -102,8 +94,6 @@ std::unique_ptr is_nan(cudf::column_view const& input, /** * @copydoc cudf::is_not_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 9e2b85ea129..eee974c8399 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -169,7 +169,12 @@ class pinned_allocator { * It is the responsibility of the caller to destroy * the objects stored at \p p. */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + __host__ inline void deallocate(pointer p, size_type /*cnt*/) + { + auto dealloc_worked = cudaFreeHost(p); + (void)dealloc_worked; + assert(dealloc_worked == cudaSuccess); + } /** * @brief This method returns the maximum size of the \c cnt parameter diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index e7ea53c740a..cad495d0097 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 865cc004107..2ee6f19614d 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -129,6 +129,12 @@ struct column_metadata { * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. 
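The precisions quoted in the to_arrow notes below come from the same formula as the max_precision() helper added to cudf/detail/interop.hpp earlier in this diff; the numbers can be checked with a stand-alone sketch (not part of the diff):

#include <cmath>
#include <cstdio>

// Evaluate the formula max_precision<T>() uses: floor(bits * log10(2)).
int precision_for_bits(int num_bits)
{
  return static_cast<int>(std::floor(num_bits * std::log(2) / std::log(10)));
}

int main()
{
  std::printf("decimal32  -> Arrow decimal128(precision %d)\n", precision_for_bits(32));   // 9
  std::printf("decimal64  -> Arrow decimal128(precision %d)\n", precision_for_bits(64));   // 19
  std::printf("decimal128 -> Arrow decimal128(precision %d)\n", precision_for_bits(128));  // 38
  return 0;
}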
*/ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, @@ -145,6 +151,12 @@ std::shared_ptr to_arrow(table_view input, * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. */ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 17c168f38d4..89207302850 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ class avro_reader_options { * * @param src source information used to read avro file */ - explicit avro_reader_options(source_info const& src) : _source(src) {} + explicit avro_reader_options(source_info src) : _source{std::move(src)} {} friend avro_reader_options_builder; @@ -123,7 +123,7 @@ class avro_reader_options { * @param src source information used to read avro file * @returns builder to build reader options */ - static avro_reader_options_builder builder(source_info const& src); + static avro_reader_options_builder builder(source_info src); }; /** @@ -145,7 +145,7 @@ class avro_reader_options_builder { * * @param src The source information used to read avro file */ - explicit avro_reader_options_builder(source_info const& src) : options(src) {} + explicit avro_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set names of the column to be read. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index b49a13a8ea9..435583e805d 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -138,7 +138,7 @@ class csv_reader_options { * * @param src source information used to read csv file */ - explicit csv_reader_options(source_info const& src) : _source(src) {} + explicit csv_reader_options(source_info src) : _source{std::move(src)} {} friend csv_reader_options_builder; @@ -156,7 +156,7 @@ class csv_reader_options { * @param src Source information to read csv file * @return Builder to build reader options */ - static csv_reader_options_builder builder(source_info const& src); + static csv_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -835,7 +835,7 @@ class csv_reader_options_builder { * * @param src The source information used to read csv file */ - csv_reader_options_builder(source_info const& src) : options(src) {} + csv_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets compression format of the source. 
@@ -1307,6 +1307,7 @@ class csv_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -1314,6 +1315,7 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -1715,9 +1717,11 @@ class csv_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 9fdc7a47fb9..40ddcf385b0 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6930a4fdb25..d0a9543397d 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d408d249a7f..472d42b1db5 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -121,7 +121,7 @@ class json_reader_options { * * @param src source information used to read parquet file */ - explicit json_reader_options(source_info const& src) : _source(src) {} + explicit json_reader_options(source_info src) : _source{std::move(src)} {} friend json_reader_options_builder; @@ -139,7 +139,7 @@ class json_reader_options { * @param src source information used to read json file * @returns builder to build the options */ - static json_reader_options_builder builder(source_info const& src); + static json_reader_options_builder builder(source_info src); /** * @brief Returns source info. 
@@ -351,7 +351,7 @@ class json_reader_options_builder { * * @param src The source information used to read avro file */ - explicit json_reader_options_builder(source_info const& src) : options(src) {} + explicit json_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set data types for columns to be read. @@ -512,6 +512,7 @@ class json_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * @@ -519,6 +520,7 @@ class json_reader_options_builder { */ table_with_metadata read_json( json_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -861,9 +863,11 @@ class json_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 024f4f23b94..c2762b05aa6 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -80,7 +80,7 @@ class orc_reader_options { * * @param src source information used to read orc file */ - explicit orc_reader_options(source_info const& src) : _source(src) {} + explicit orc_reader_options(source_info src) : _source{std::move(src)} {} public: /** @@ -96,7 +96,7 @@ class orc_reader_options { * @param src Source information to read orc file * @return Builder to build reader options */ - static orc_reader_options_builder builder(source_info const& src); + static orc_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -269,7 +269,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info const& src) : options{src} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; /** * @brief Sets names of the column to read. @@ -450,6 +450,8 @@ class orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend orc_writer_options_builder; @@ -572,6 +574,13 @@ class orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. + * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -670,6 +679,13 @@ class orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. 
+ * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -810,6 +826,18 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -866,6 +894,8 @@ class chunked_orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend chunked_orc_writer_options_builder; @@ -966,6 +996,13 @@ class chunked_orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. + * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -1057,6 +1094,13 @@ class chunked_orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -1183,6 +1227,18 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + chunked_orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. 
*/ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..ea18da74d5a 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -80,7 +80,7 @@ class parquet_reader_options { * * @param src source information used to read parquet file */ - explicit parquet_reader_options(source_info const& src) : _source(src) {} + explicit parquet_reader_options(source_info src) : _source{std::move(src)} {} friend parquet_reader_options_builder; @@ -98,7 +98,7 @@ class parquet_reader_options { * @param src Source information to read parquet file * @return Builder to build reader options */ - static parquet_reader_options_builder builder(source_info const& src); + static parquet_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -265,7 +265,7 @@ class parquet_reader_options_builder { * * @param src The source information used to read parquet file */ - explicit parquet_reader_options_builder(source_info const& src) : options(src) {} + explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the columns to be read. @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -532,6 +532,9 @@ class parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC + // Defaults to true because libcudf timestamps are implicitly UTC + bool _write_timestamps_as_UTC = true; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -652,6 +655,13 @@ class parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns Column chunks file paths to be set in the raw output metadata. * @@ -789,6 +799,13 @@ class parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets column chunks file path to be set in the raw output metadata. 
* @@ -1100,6 +1117,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. + * @return this for chaining + */ + parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * @@ -1171,6 +1200,8 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC. Defaults to true. + bool _write_timestamps_as_UTC = true; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -1254,6 +1285,13 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns maximum row group size, in bytes. * @@ -1375,6 +1413,13 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets the maximum row group size, in bytes. * @@ -1539,6 +1584,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. + * @return this for chaining + */ + chunked_parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. 
* @@ -1750,7 +1807,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a97f81182ac..50119e60882 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -195,9 +195,9 @@ class writer_compression_statistics { * @brief Control use of dictionary encoding for parquet writer */ enum dictionary_policy { - NEVER, ///< Never use dictionary encoding - ADAPTIVE, ///< Use dictionary when it will not impact compression - ALWAYS ///< Use dictionary reqardless of impact on compression + NEVER = 0, ///< Never use dictionary encoding + ADAPTIVE = 1, ///< Use dictionary when it will not impact compression + ALWAYS = 2 ///< Use dictionary regardless of impact on compression }; /** @@ -293,14 +293,20 @@ struct source_info { * * @param file_paths Input files paths */ - explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + explicit source_info(std::vector const& file_paths) + : _type(io_type::FILEPATH), _filepaths(file_paths) + { + } /** * @brief Construct a new source info object for a single file * * @param file_path Single input file */ - explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} + explicit source_info(std::string const& file_path) + : _type(io_type::FILEPATH), _filepaths({file_path}) + { + } /** * @brief Construct a new source info object for multiple buffers in host memory @@ -444,7 +450,7 @@ struct source_info { [[nodiscard]] auto const& user_sources() const { return _user_sources; } private: - io_type _type = io_type::FILEPATH; + io_type _type = io_type::VOID; std::vector _filepaths; std::vector> _host_buffers; std::vector> _device_buffers; diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/json/json.hpp similarity index 94% rename from cpp/include/cudf/strings/json.hpp rename to cpp/include/cudf/json/json.hpp index 8fabee6b9a5..944e0c26dd6 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -16,16 +16,16 @@ #pragma once #include +#include #include #include namespace cudf { -namespace strings { /** - * @addtogroup strings_json + * @addtogroup json_object * @{ * @file */ @@ -155,20 +155,21 @@ class get_json_object_options { * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html * Implements only the operators: $ . [] * * + * @throw std::invalid_argument if provided an invalid operator or an empty name + * * @param col The input strings column. Each row must contain a valid json string * @param json_path The JSONPath string to be applied to each row * @param options Options for controlling the behavior of the function - * @param mr Resource for allocating device memory. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Resource for allocating device memory * @return New strings column containing the retrieved json object strings - * - * @throw std::invalid_argument if provided an invalid operator or an empty name */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options = get_json_object_options{}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group -} // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. @@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git 
a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp index e92354134e8..14c0f59e17d 100644 --- a/cpp/include/cudf/lists/extract.hpp +++ b/cpp/include/cudf/lists/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,12 +59,14 @@ namespace lists { * * @param lists_column Column to extract elements from. * @param index The row within each sublist to retrieve. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return Column of extracted elements. */ std::unique_ptr extract_list_element( lists_column_view const& lists_column, size_type index, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -97,6 +99,7 @@ std::unique_ptr extract_list_element( * @param lists_column Column to extract elements from. * @param indices The column whose rows indicate the element index to be retrieved from each list * row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return Column of extracted elements. * @throws cudf::logic_error If the sizes of `lists_column` and `indices` do not match. 
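For context, a rough sketch of how the stream-accepting lists APIs above might be called once this change lands; the helper name, the INT32 key, and the locally created stream are illustrative assumptions, not part of this patch.

#include <cudf/column/column.hpp>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: search INT32 list rows for a key, with all work ordered on one stream.
std::unique_ptr<cudf::column> search_lists_on_stream(cudf::lists_column_view const& lists_col)
{
  rmm::cuda_stream stream;                 // non-default stream owned by the caller
  cudf::numeric_scalar<int32_t> key{42};   // assumes the list elements are INT32
  [[maybe_unused]] auto counts = cudf::lists::count_elements(lists_col, stream.view());
  auto found = cudf::lists::contains(lists_col, key, stream.view());
  stream.synchronize();                    // make results safe to read on other streams
  return found;
}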
@@ -104,6 +107,7 @@ std::unique_ptr extract_list_element( std::unique_ptr extract_list_element( lists_column_view const& lists_column, column_view const& indices, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp index 059ed5ffd33..3730e16482d 100644 --- a/cpp/include/cudf/lists/filling.hpp +++ b/cpp/include/cudf/lists/filling.hpp @@ -17,7 +17,9 @@ #pragma once #include +#include +#include #include #include @@ -57,12 +59,14 @@ namespace cudf::lists { * * @param starts First values in the result sequences. * @param sizes Numbers of values in the result sequences. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return The result column containing generated sequences. */ std::unique_ptr sequences( column_view const& starts, column_view const& sizes, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,6 +100,7 @@ std::unique_ptr sequences( * @param starts First values in the result sequences. * @param steps Increment values for the result sequences. * @param sizes Numbers of values in the result sequences. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return The result column containing generated sequences. */ @@ -103,6 +108,7 @@ std::unique_ptr sequences( column_view const& starts, column_view const& steps, column_view const& sizes, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index 38bed9ede43..5e6ab6816e6 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -65,6 +65,7 @@ namespace lists { * @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the * output list row's element, when the gather index falls outside the range `[-n, n)`, * where `n` is the number of elements in list row corresponding to the gather-map row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to allocate any returned objects * @return column with elements in list of rows gathered based on `gather_map_list` * @@ -73,6 +74,7 @@ std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp index 226d417c53a..864cd796f72 100644 --- a/cpp/include/cudf/lists/reverse.hpp +++ b/cpp/include/cudf/lists/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,11 +42,13 @@ namespace cudf::lists { * @endcode * * @param input Lists column for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column with reversed lists */ std::unique_ptr reverse( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index 9d58d0f5b98..6fb8989f0bb 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,6 +53,7 @@ namespace cudf::lists { * to be `UNEQUAL` which means only non-null elements are checked for overlapping * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A column of type BOOL containing the check results */ std::unique_ptr have_overlap( @@ -60,6 +61,7 @@ std::unique_ptr have_overlap( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,6 +89,7 @@ std::unique_ptr have_overlap( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the intersection results */ @@ -95,6 +98,7 @@ std::unique_ptr intersect_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -122,6 +126,7 @@ std::unique_ptr intersect_distinct( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the union results */ @@ -130,6 +135,7 @@ std::unique_ptr union_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -157,6 +163,7 @@ std::unique_ptr union_distinct( *
@param rhs The input lists column of elements to exclude * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the difference results */ @@ -165,6 +172,7 @@ std::unique_ptr difference_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp index c203c452b0d..39a52c75a98 100644 --- a/cpp/include/cudf/lists/sorting.hpp +++ b/cpp/include/cudf/lists/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,7 @@ namespace lists { * @param source_column View of the list column of numeric types to sort * @param column_order The desired sort order * @param null_precedence The desired order of null compared to other elements in the list + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return list column with elements in each list sorted. * @@ -54,6 +55,7 @@ std::unique_ptr sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -66,6 +68,7 @@ std::unique_ptr stable_sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 5ddaa992184..3ac4f6861ec 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,12 +54,14 @@ namespace cudf::lists { * * @param input The input list column view to be filtered * @param boolean_mask A nullable list of bools column used to filter `input` elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return List column of the same type as `input`, containing filtered list rows */ std::unique_ptr apply_boolean_mask( lists_column_view const& input, lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -78,6 +80,7 @@ std::unique_ptr apply_boolean_mask( * @param input The input lists column * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return The resulting lists column containing lists without duplicates */ @@ -85,6 +88,7 @@ std::unique_ptr distinct( lists_column_view const& input, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 3d09550209d..8886ec24bfe 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,10 @@ namespace cudf { * @brief Merge a set of sorted tables. * * Merges sorted tables into one sorted table - * containing data from all tables. + * containing data from all tables. The key columns + * of each table must be sorted according to the + * parameters (cudf::column_order and cudf::null_order) + * specified for that column. 
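To illustrate the sorted-key requirement spelled out above, a minimal sketch; the call shape follows the cudf::merge API as generally documented (it is not shown in this diff), and the table names and key layout are hypothetical.

#include <cudf/merge.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Hypothetical helper: merge two tables that are already sorted on key column 0.
std::unique_ptr<cudf::table> merge_two_sorted(cudf::table_view const& tbl_a,
                                              cudf::table_view const& tbl_b)
{
  // The declared order and null ordering must match how the key column was actually
  // sorted in both inputs, otherwise the merged result is not guaranteed to be sorted.
  return cudf::merge({tbl_a, tbl_b},
                     {0},                          // key column index
                     {cudf::order::ASCENDING},     // sort order of that key
                     {cudf::null_order::BEFORE});  // where nulls were placed
}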
* * ``` * Example 1: diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. 
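A small sketch of the new stream parameters on the bitmask utilities above; the helper, the row count, and the choice to null out the first ten rows are assumptions for illustration only.

#include <algorithm>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>

// Hypothetical helper: allocate an all-valid mask, then mark the first rows null on one stream.
rmm::device_buffer make_mask_on_stream(cudf::size_type num_rows)
{
  rmm::cuda_stream stream;
  auto mask = cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID, stream.view());
  cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()),
                      0,
                      std::min<cudf::size_type>(10, num_rows),
                      false,            // false == mark these entries null
                      stream.view());
  stream.synchronize();
  return mask;
}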
* * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 8b6c434719a..c6db5dab08a 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,18 +53,20 @@ namespace strings { * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param types The character types to check in each string. + * @param input Strings instance for this operation + * @param types The character types to check in each string * @param verify_types Only verify against these character types. * Default `ALL_TYPES` means return `true` * iff all characters match `types`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr all_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types, string_character_types verify_types = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,20 +98,22 @@ std::unique_ptr all_characters_of_type( * @throw cudf::logic_error if neither or both `types_to_remove` and * `types_to_keep` are set to `ALL_TYPES`. * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param types_to_remove The character types to check in each string. * Use `ALL_TYPES` here to specify `types_to_keep` instead. - * @param replacement The replacement character to use when removing characters. + * @param replacement The replacement character to use when removing characters * @param types_to_keep Default `ALL_TYPES` means all characters of * `types_to_remove` will be filtered. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with the specified character types filtered out */ std::unique_ptr filter_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement = string_scalar(""), string_character_types types_to_keep = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 71f65ac9080..568e8ac50ec 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -66,18 +66,20 @@ enum class output_if_empty_list { * * @throw cudf::logic_error if separator is not valid. * - * @param strings Strings for this operation. + * @param input Strings for this operation * @param separator String that should inserted between each string. * Default is an empty string. - * @param narep String that should represent any null strings found. + * @param narep String to replace any null strings found. * Default of invalid-scalar will ignore any null entries. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column containing one string.
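As a rough illustration of the renamed `input` parameter and the new stream argument on `all_characters_of_type` above; the helper name and the character-type choice are placeholders.

#include <cudf/column/column.hpp>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: flag rows whose characters are all decimal digits.
std::unique_ptr<cudf::column> rows_all_decimal(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  return cudf::strings::all_characters_of_type(
    input,
    cudf::strings::string_character_types::DECIMAL,
    cudf::strings::string_character_types::ALL_TYPES,  // every character must match
    stream.view());
}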
*/ std::unique_ptr join_strings( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -127,18 +129,17 @@ std::unique_ptr join_strings( * @throw cudf::logic_error if the number of rows from @p separators and @p strings_columns * do not match * - * @param strings_columns List of strings columns to concatenate. + * @param strings_columns List of strings columns to concatenate * @param separators Strings column that provides the separator for a given row - * @param separator_narep String that should be used in place of a null separator for a given - * row. Default of invalid-scalar means no row separator value replacements. - * Default is an invalid string. - * @param col_narep String that should be used in place of any null strings - * found in any column. Default of invalid-scalar means no null column value replacements. - * Default is an invalid string. + * @param separator_narep String to replace a null separator for a given row. + * Default of invalid-scalar means no row separator value replacements. + * @param col_narep String that should be used in place of any null strings found in any column. + * Default of invalid-scalar means no null column value replacements. * @param separate_nulls If YES, then the separator is included for null rows * if `col_narep` is valid. - * @param mr Resource for allocating device memory. - * @return New column with concatenated results. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Resource for allocating device memory + * @return New column with concatenated results */ std::unique_ptr concatenate( table_view const& strings_columns, @@ -146,6 +147,7 @@ std::unique_ptr concatenate( string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -184,21 +186,23 @@ std::unique_ptr concatenate( * @throw cudf::logic_error if separator is not valid. * @throw cudf::logic_error if only one column is specified * - * @param strings_columns List of string columns to concatenate. + * @param strings_columns List of string columns to concatenate * @param separator String that should inserted between each string from each row. * Default is an empty string. - * @param narep String that should be used in place of any null strings - * found in any column. Default of invalid-scalar means any null entry in any column will + * @param narep String to replace any null strings found in any column. + * Default of invalid-scalar means any null entry in any column will * produces a null result for that row. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with concatenated results. 
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with concatenated results */ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,19 +247,20 @@ std::unique_ptr concatenate( * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do * not match * - * @param lists_strings_column Column containing lists of strings to concatenate. - * @param separators Strings column that provides separators for concatenation. - * @param separator_narep String that should be used to replace null separator, default is an - * invalid-scalar denoting that rows containing null separator will result in null string in - * the corresponding output rows. - * @param string_narep String that should be used to replace null strings in any non-null list row, - * default is an invalid-scalar denoting that list rows containing null strings will result - * in null string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separators Strings column that provides separators for concatenation + * @param separator_narep String that should be used to replace a null separator. + * Default is an invalid-scalar denoting that rows containing null separator will result in + * a null string in the corresponding output rows. + * @param string_narep String to replace null strings in any non-null list row. + * Default is an invalid-scalar denoting that list rows containing null strings will result + * in a null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will * result in an empty string. Otherwise, it will result in a null. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with concatenated results. 
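A hedged sketch of the column-wise `concatenate` overload documented above, using a scalar separator and an explicit stream; the separator, null replacement, and helper name are invented for the example.

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: concatenate string columns row-wise with ", " between the values.
std::unique_ptr<cudf::column> join_row_wise(cudf::table_view const& string_cols)
{
  rmm::cuda_stream stream;
  cudf::string_scalar sep{", "};
  cudf::string_scalar narep{"<null>"};  // substitute for null entries so rows stay non-null
  return cudf::strings::concatenate(string_cols,
                                    sep,
                                    narep,
                                    cudf::strings::separator_on_nulls::YES,
                                    stream.view());
}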
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with concatenated results */ std::unique_ptr join_list_elements( lists_column_view const& lists_strings_column, @@ -264,6 +269,7 @@ std::unique_ptr join_list_elements( string_scalar const& string_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -303,17 +309,18 @@ std::unique_ptr join_list_elements( * @throw cudf::logic_error if input column is not lists of strings column. * @throw cudf::logic_error if separator is not valid. * - * @param lists_strings_column Column containing lists of strings to concatenate. - * @param separator String that should inserted between strings of each list row, default is an - * empty string. - * @param narep String that should be used to replace null strings in any non-null list row, default - * is an invalid-scalar denoting that list rows containing null strings will result in null - * string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separator String to insert between strings of each list row. + * Default is an empty string. + * @param narep String to replace null strings in any non-null list row. + * Default is an invalid-scalar denoting that list rows containing null strings will result + * in a null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will result * in an empty string. Otherwise, it will result in a null. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with concatenated results. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with concatenated results */ std::unique_ptr join_list_elements( lists_column_view const& lists_strings_column, @@ -321,6 +328,7 @@ std::unique_ptr join_list_elements( string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 23c77cb60da..341c146df92 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -31,7 +31,7 @@ struct regex_program; * @addtogroup strings_contains * @{ * @file strings/contains.hpp - * @brief Strings APIs for regex contains, count, matches + * @brief Strings APIs for regex contains, count, matches, like */ /** @@ -50,14 +50,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,14 +78,16 @@ std::unique_ptr contains_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,14 +106,16 @@ std::unique_ptr matches_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
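For the regex-based APIs above, a brief sketch pairing a compiled `regex_program` with the new stream argument; the pattern and helper name are examples only, and `regex_program::create` is assumed from the existing cudf strings regex API rather than from this diff.

#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: mark rows that contain at least one run of digits.
std::unique_ptr<cudf::column> rows_with_digits(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  auto prog = cudf::strings::regex_program::create("\\d+");  // compile once, reuse as needed
  return cudf::strings::contains_re(input, *prog, stream.view());
}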
* - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of match counts for each string */ std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -146,8 +152,9 @@ std::unique_ptr count_re( * * @param input Strings instance for this operation * @param pattern Like pattern to match within each string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -155,6 +162,7 @@ std::unique_ptr like( strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,8 +193,9 @@ std::unique_ptr like( * * @param input Strings instance for this operation * @param patterns Like patterns to match within each corresponding string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -194,6 +203,7 @@ std::unique_ptr like( strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp index ab63503f166..9e9f25e800a 100644 --- a/cpp/include/cudf/strings/convert/convert_booleans.hpp +++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,16 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param true_string String to expect for true. Non-matching strings are false. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column converted from strings. + * @param input Strings instance for this operation + * @param true_string String to expect for true. 
Non-matching strings are false + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column converted from strings */ std::unique_ptr to_booleans( - strings_column_view const& strings, - string_scalar const& true_string = string_scalar("true"), + strings_column_view const& input, + string_scalar const& true_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -53,16 +55,18 @@ std::unique_ptr to_booleans( * * @throw cudf::logic_error if the input column is not BOOL8 type. * - * @param booleans Boolean column to convert. - * @param true_string String to use for true in the output column. - * @param false_string String to use for false in the output column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param booleans Boolean column to convert + * @param true_string String to use for true in the output column + * @param false_string String to use for false in the output column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_booleans( column_view const& booleans, - string_scalar const& true_string = string_scalar("true"), - string_scalar const& false_string = string_scalar("false"), + string_scalar const& true_string, + string_scalar const& false_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index fa729d26734..81cce14b53b 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -77,16 +77,18 @@ namespace strings { * * @throw cudf::logic_error if timestamp_type is not a timestamp type. * - * @param strings Strings instance for this operation. - * @param timestamp_type The timestamp type used for creating the output column. - * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New datetime column. + * @param input Strings instance for this operation + * @param timestamp_type The timestamp type used for creating the output column + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New datetime column */ std::unique_ptr to_timestamps( - strings_column_view const& strings, + strings_column_view const& input, data_type timestamp_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +126,16 @@ std::unique_ptr to_timestamps( * This will return a column of type BOOL8 where a `true` row indicates the corresponding * input string can be parsed correctly with the given format. * - * @param strings Strings instance for this operation. 
- * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column. + * @param input Strings instance for this operation + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr is_timestamp( - strings_column_view const& strings, + strings_column_view const& input, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -231,19 +235,21 @@ std::unique_ptr is_timestamp( * @throw cudf::logic_error if the `format` string is empty * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. * - * @param timestamps Timestamp values to convert. + * @param timestamps Timestamp values to convert * @param format The string specifying output format. * Default format is "%Y-%m-%dT%H:%M:%SZ". * @param names The string names to use for weekdays ("%a", "%A") and months ("%b", "%B") * Default is an empty `strings_column_view`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted timestamps. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with formatted timestamps */ std::unique_ptr from_timestamps( column_view const& timestamps, std::string_view format = "%Y-%m-%dT%H:%M:%SZ", strings_column_view const& names = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp index e915ec26279..a1f4e4ead1d 100644 --- a/cpp/include/cudf/strings/convert/convert_durations.hpp +++ b/cpp/include/cudf/strings/convert/convert_durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,16 +65,18 @@ namespace strings { * * @throw cudf::logic_error if duration_type is not a duration type. * - * @param strings Strings instance for this operation. - * @param duration_type The duration type used for creating the output column. - * @param format String specifying the duration format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New duration column. 
+ * @param input Strings instance for this operation + * @param duration_type The duration type used for creating the output column + * @param format String specifying the duration format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New duration column */ std::unique_ptr to_durations( - strings_column_view const& strings, + strings_column_view const& input, data_type duration_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,15 +117,17 @@ std::unique_ptr to_durations( * * @throw cudf::logic_error if `durations` column parameter is not a duration type. * - * @param durations Duration values to convert. + * @param durations Duration values to convert * @param format The string specifying output format. - * Default format is ""%d days %H:%M:%S". - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted durations. + * Default format is "%D days %H:%M:%S". + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with formatted durations */ std::unique_ptr from_durations( column_view const& durations, std::string_view format = "%D days %H:%M:%S", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 3852dc8e81a..8f37715967a 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,14 +53,16 @@ namespace strings { * * @throw cudf::logic_error if `output_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param output_type Type of fixed-point column to return including the scale value. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of `output_type`. + * @param input Strings instance for this operation + * @param output_type Type of fixed-point column to return including the scale value + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of `output_type` */ std::unique_ptr to_fixed_point( strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,12 +85,14 @@ std::unique_ptr to_fixed_point( * * @throw cudf::logic_error if the `input` column is not a fixed-point decimal type. * - * @param input Fixed-point column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory.
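A small sketch round-tripping durations with the `%D days %H:%M:%S` format noted above; the DURATION_SECONDS resolution and the names are placeholders.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/convert/convert_durations.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: render DURATION_SECONDS values as text and parse them back.
std::unique_ptr<cudf::column> durations_round_trip(cudf::column_view const& durations_s)
{
  rmm::cuda_stream stream;
  auto as_text = cudf::strings::from_durations(durations_s, "%D days %H:%M:%S", stream.view());
  return cudf::strings::to_durations(cudf::strings_column_view(as_text->view()),
                                     cudf::data_type{cudf::type_id::DURATION_SECONDS},
                                     "%D days %H:%M:%S",
                                     stream.view());
}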
- * @return New strings column. + * @param input Fixed-point column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_fixed_point( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,14 +115,16 @@ std::unique_ptr from_fixed_point( * * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param decimal_type Fixed-point type (with scale) used only for checking overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param decimal_type Fixed-point type (with scale) used only for checking overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_fixed_point( strings_column_view const& input, data_type decimal_type = data_type{type_id::DECIMAL64}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index 38a84fc1548..a35cb68ef4e 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,14 +39,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not float type. * - * @param strings Strings instance for this operation. - * @param output_type Type of float numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with floats converted from strings. + * @param strings Strings instance for this operation + * @param output_type Type of float numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with floats converted from strings */ std::unique_ptr to_floats( strings_column_view const& strings, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -62,12 +64,14 @@ std::unique_ptr to_floats( * * @throw cudf::logic_error if floats column is not float type. * - * @param floats Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with floats as strings. 
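To show the float conversion above with its new stream parameter, a minimal sketch; the FLOAT64 target type and helper name are arbitrary choices.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: parse number-like strings into FLOAT64 values on a given stream.
std::unique_ptr<cudf::column> parse_floats(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  return cudf::strings::to_floats(input,
                                  cudf::data_type{cudf::type_id::FLOAT64},
                                  stream.view());
}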
+ * @param floats Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with floats as strings */ std::unique_ptr from_floats( column_view const& floats, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,12 +90,14 @@ std::unique_ptr from_floats( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_float( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 44213b84139..74ec5d315a2 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -46,14 +46,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. - * @param output_type Type of integer numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with integers converted from strings. + * @param input Strings instance for this operation + * @param output_type Type of integer numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with integers converted from strings */ std::unique_ptr to_integers( - strings_column_view const& strings, + strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,12 +69,14 @@ std::unique_ptr to_integers( * * @throw cudf::logic_error if integers column is not integral type. * - * @param integers Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with integers as strings. 
+ * @param integers Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with integers as strings */ std::unique_ptr from_integers( column_view const& integers, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,12 +98,14 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +130,16 @@ std::unique_ptr is_integer( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param int_type Integer type used for checking underflow and overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param int_type Integer type used for checking underflow and overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, data_type int_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -152,14 +160,16 @@ std::unique_ptr is_integer( * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. - * @param output_type Type of integer numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with integers converted from strings. + * @param input Strings instance for this operation + * @param output_type Type of integer numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with integers converted from strings */ std::unique_ptr hex_to_integers( - strings_column_view const& strings, + strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -179,12 +189,14 @@ std::unique_ptr hex_to_integers( * * Any null row results in a null entry for that row in the output column. 
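Similarly for the integer converters above, a rough sketch that validates for INT32 range before converting; all names are illustrative.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: check INT32 range first, then convert the strings.
std::unique_ptr<cudf::column> parse_int32(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  [[maybe_unused]] auto fits =
    cudf::strings::is_integer(input, cudf::data_type{cudf::type_id::INT32}, stream.view());
  return cudf::strings::to_integers(input,
                                    cudf::data_type{cudf::type_id::INT32},
                                    stream.view());
}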
* - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_hex( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -199,23 +211,25 @@ std::unique_ptr is_hex( * * @code{.pseudo} * Example: - * input = [123, -1, 0, 27, 342718233] // int32 type input column + * input = [1234, -1, 0, 27, 342718233] // int32 type input column * s = integers_to_hex(input) * s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719'] * @endcode * * The example above shows an `INT32` type column where each integer is 4 bytes. * Leading zeros are suppressed unless filling out a complete byte as in - * `123 -> '04D2'` instead of `000004D2` or `4D2`. + * `1234 -> '04D2'` instead of `000004D2` or `4D2`. * * @throw cudf::logic_error if the input column is not integral type. * - * @param input Integer column to convert to hex. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with hexadecimal characters. + * @param input Integer column to convert to hex + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with hexadecimal characters */ std::unique_ptr integers_to_hex( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 22272af74fc..25ad7b86748 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,12 +48,14 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT64 column converted from strings. 
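For orientation, a minimal usage sketch of the stream-enabled integer/hex conversion APIs declared above; it is not part of the patch, and it leans on the cudf_test column wrappers purely to keep the example self-contained.

#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void hex_round_trip_example()
{
  rmm::cuda_stream stream;  // work is issued on this stream instead of the default stream

  // int32 input matching the integers_to_hex documentation example
  cudf::test::fixed_width_column_wrapper<int32_t> ints({1234, -1, 0, 27, 342718233});

  // 1234 -> "04D2", -1 -> "FFFFFFFF", 0 -> "00", ...
  auto hex = cudf::strings::integers_to_hex(ints, stream.view());

  // convert back on the same stream so the two calls are ordered with respect to each other
  auto round_trip = cudf::strings::hex_to_integers(
    cudf::strings_column_view(hex->view()), cudf::data_type{cudf::type_id::INT32}, stream.view());
}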
+ * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT64 column converted from strings */ std::unique_ptr ipv4_to_integers( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,12 +73,14 @@ std::unique_ptr ipv4_to_integers( * * @throw cudf::logic_error if the input column is not INT64 type. * - * @param integers Integer (INT64) column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param integers Integer (INT64) column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr integers_to_ipv4( column_view const& integers, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,12 +100,14 @@ std::unique_ptr integers_to_ipv4( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_ipv4( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index 7ab1bf47b0a..dedf4e95138 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,17 +50,19 @@ namespace strings { * * @throw cudf::logic_error if the input column is not a LIST type with a STRING child. * - * @param input Lists column to format. - * @param na_rep Replacement string for null elements. - * @param separators Strings to use for enclosing list components and separating elements. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Lists column to format + * @param na_rep Replacement string for null elements + * @param separators Strings to use for enclosing list components and separating elements + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr format_list_column( lists_column_view const& input, - string_scalar const& na_rep = string_scalar("NULL"), + string_scalar const& na_rep = string_scalar(""), strings_column_view const& separators = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index 7f29a0d2149..902835081af 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,12 +39,14 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr url_encode( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,12 +62,14 @@ std::unique_ptr url_encode( * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr url_decode( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 965e89cc862..5f50faa158e 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -18,8 +18,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp new file mode 100644 index 00000000000..611e32e28cd --- /dev/null +++ b/cpp/include/cudf/strings/detail/scan.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +/** + * @brief Scan function for strings + * + * Called by cudf::scan() with only min and max aggregates. + * + * @tparam Op Either DeviceMin or DeviceMax operations + * + * @param input Input strings column + * @param mask Mask for scan + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +template +std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 586cb1f3f26..a4db1ac46da 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -53,14 +53,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Columns of strings extracted from the input column */ std::unique_ptr
extract( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,14 +89,16 @@ std::unique_ptr<table>
extract( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate any returned device memory * @return Lists column containing strings extracted from the input column */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index 7699159fbea..f0cb351eeda 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. 
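A small sketch of how the new trailing stream argument threads through the padding APIs above; illustrative only, with arbitrary column values and the test wrapper used just for brevity.

#include <cudf/strings/padding.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void padding_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"1234", "-9876", "+0.34"});
  cudf::strings_column_view input(col);

  // '1234' -> '001234', '-9876' -> '-09876', '+0.34' -> '+00.34' (width 6, as in the zfill example)
  auto zero_filled = cudf::strings::zfill(input, 6, stream.view());

  // right-pad to width 8 with spaces, issued on the same stream
  auto padded = cudf::strings::pad(input, 8, cudf::strings::side_type::RIGHT, " ", stream.view());
}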
+ * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 2b6575f80d0..7dc9c33f579 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -52,12 +52,14 @@ namespace strings { * * @param input The scalar containing the string to repeat * @param repeat_times The number of times the input string is repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned string scalar * @return New string scalar in which the input string is repeated */ std::unique_ptr repeat_string( string_scalar const& input, size_type repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,12 +83,14 @@ std::unique_ptr repeat_string( * * @param input The column containing strings to repeat * @param repeat_times The number of times each input string is repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings */ std::unique_ptr repeat_strings( strings_column_view const& input, size_type repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,13 +119,15 @@ std::unique_ptr repeat_strings( * * @param input The column containing strings to repeat * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated + * for each row are repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings. */ std::unique_ptr repeat_strings( strings_column_view const& input, column_view const& repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 22818f7542e..2476a41e886 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,19 +54,21 @@ namespace strings { * * @throw cudf::logic_error if target is an empty string. * - * @param strings Strings column for this operation. - * @param target String to search for within each string. 
- * @param repl Replacement string if target is found. + * @param input Strings column for this operation + * @param target String to search for within each string + * @param repl Replacement string if target is found * @param maxrepl Maximum times to replace if target appears multiple times in the input string. * Default of -1 specifies replace all occurrences of target in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl = -1, + cudf::size_type maxrepl = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,21 +94,23 @@ std::unique_ptr replace( * * @throw cudf::logic_error if start is greater than stop. * - * @param strings Strings column for this operation. + * @param input Strings column for this operation. * @param repl Replacement string for specified positions found. * Default is empty string. * @param start Start position where repl will be added. * Default is 0, first character position. * @param stop End position (exclusive) to use for replacement. * Default of -1 specifies the end of each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_slice( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& repl = string_scalar(""), size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,16 +145,18 @@ std::unique_ptr replace_slice( * if repls is a single string. * @throw cudf::logic_error if targets or repls contain null entries. * - * @param strings Strings column for this operation. - * @param targets Strings to search for in each string. - * @param repls Corresponding replacement strings for target strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Strings column for this operation + * @param targets Strings to search for in each string + * @param repls Corresponding replacement strings for target strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index bc6659835c3..77db2882253 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -43,20 +43,22 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. * @param max_replace_count The maximum number of times to replace the matched pattern * within each string. Default replaces every substring that is matched. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,18 +69,20 @@ std::unique_ptr replace_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param patterns The regular expression patterns to search within each string. - * @param replacements The strings used for replacement. - * @param flags Regex flags for interpreting special characters in the patterns. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
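An illustrative call to the replace overloads above with an explicit stream; not part of the diff, and the sample strings are made up.

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/replace.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void replace_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"hello", "hullo", "hollow"});
  cudf::strings_column_view input(col);

  cudf::string_scalar target("ll");
  cudf::string_scalar repl("LL");

  // maxrepl of -1 replaces every occurrence; the stream now precedes the memory resource
  auto replaced = cudf::strings::replace(input, target, repl, -1, stream.view());
}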
+ * @param input Strings instance for this operation + * @param patterns The regular expression patterns to search within each string + * @param replacements The strings used for replacement + * @param flags Regex flags for interpreting special characters in the patterns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags = regex_flags::DEFAULT, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,16 +96,18 @@ std::unique_ptr replace_re( * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also * if the index exceeds the group count specified in the pattern * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The replacement template for creating the output string + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_with_backrefs( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 26fb36a540e..4fc8fbf67c2 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,10 +42,12 @@ namespace strings { * * @param input Strings column for this operation * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings column */ std::unique_ptr reverse( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. - * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. 
+ * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. - * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 52ffb735eb7..25eedf1e86b 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,15 +51,17 @@ namespace strings { * r[2] is ["cd","g_h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr
partition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,15 +85,17 @@ std::unique_ptr<table>
partition( * r[2] is ["cd","h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New strings columns */ std::unique_ptr
rpartition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 14fcfaecdcd..f1736cb7e0c 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -75,6 +75,7 @@ struct regex_program; * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return A table of columns of strings */ @@ -82,6 +83,7 @@ std::unique_ptr
split_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,17 +127,19 @@ std::unique_ptr<table>
split_re( * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. + * @param input A column of string elements to be split * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return A table of columns of strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings */ std::unique_ptr
rsplit_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,13 +189,15 @@ std::unique_ptr<table>
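A usage sketch for the regex split APIs above, assuming a caller-managed stream; illustrative only, with an arbitrary pattern and input rows.

#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void split_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"a_bc def_g", "a__bc", "_ab cd", "ab_cd "});
  cudf::strings_column_view input(col);

  // split on runs of '_' or space characters
  auto prog    = cudf::strings::regex_program::create("[_ ]+");
  auto parts   = cudf::strings::split_re(input, *prog, -1, stream.view());         // table of columns
  auto records = cudf::strings::split_record_re(input, *prog, -1, stream.view());  // lists column
}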
rsplit_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory - * @return Lists column of strings. + * @return Lists column of strings */ std::unique_ptr split_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,6 +249,7 @@ std::unique_ptr split_record_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings */ @@ -250,6 +257,7 @@ std::unique_ptr rsplit_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 0cbf6b22029..4bd09352b09 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,14 +47,16 @@ namespace strings { * r is now ["AA", "", "cccc", "AcQ"] * @endcode * - * @param strings Strings instance for this operation. - * @param chars_table Table of UTF-8 character mappings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with padded strings. 
+ * @param input Strings instance for this operation + * @param chars_table Table of UTF-8 character mappings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with padded strings */ std::unique_ptr translate( - strings_column_view const& strings, + strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,19 +89,21 @@ enum class filter_type : bool { * * @throw cudf::logic_error if `replacement` is invalid * - * @param strings Strings instance for this operation. - * @param characters_to_filter Table of character ranges to filter on. + * @param input Strings instance for this operation + * @param characters_to_filter Table of character ranges to filter on * @param keep_characters If true, the `characters_to_filter` are retained and all other characters - * are removed. - * @param replacement Optional replacement string for each character removed. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with filtered strings. + * are removed + * @param replacement Optional replacement string for each character removed + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with filtered strings */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters = filter_type::KEEP, string_scalar const& replacement = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. 
+ * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/structs/detail/scan.hpp similarity index 55% rename from cpp/include/cudf/strings/detail/json.hpp rename to cpp/include/cudf/structs/detail/scan.hpp index 0fb06d36570..531e0a6c65f 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/structs/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,31 +13,33 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include -#include -#include +#include #include #include namespace cudf { -namespace strings { +namespace structs { namespace detail { - /** - * @copydoc cudf::strings::get_json_object + * @brief Scan function for struct column type + * + * Called by cudf::scan() with only min and max aggregates. + * + * @tparam Op Either DeviceMin or DeviceMax operations * + * @param input Input column * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New struct column */ -std::unique_ptr get_json_object(cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - cudf::strings::get_json_object_options options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +template +std::unique_ptr scan_inclusive(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail -} // namespace strings +} // namespace structs } // namespace cudf diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 6b024d902a9..6946ccdb213 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -52,6 +52,7 @@ #include #include #include +#include #include namespace cudf { @@ -264,6 +265,7 @@ template class device_row_comparator { + public: friend class self_comparator; ///< Allow self_comparator to access private members friend class two_table_comparator; ///< Allow two_table_comparator to access private members @@ -274,6 +276,8 @@ class device_row_comparator { * @param check_nulls Indicates if any input column contains nulls. * @param lhs The first table * @param rhs The second table (may be the same table as `lhs`) + * @param l_dremel_device_views lhs table dremel device view for list type + * @param r_dremel_device_views rhs table dremel device view for list type * @param depth Optional, device array the same length as a row that contains starting depths of * columns if they're nested, and 0 otherwise. 
* @param column_order Optional, device array the same length as a row that indicates the desired @@ -305,6 +309,44 @@ class device_row_comparator { { } + /** + * @brief Construct a function object for performing a lexicographic + * comparison between the rows of two tables. + * This is a special overload to allow device-side construction of the + * comparator for cases where no preprocessing is needed, i.e. tables with + * non-nested type columns. + * + * @param check_nulls Indicates if any input column contains nulls. + * @param lhs The first table + * @param rhs The second table (may be the same table as `lhs`) + * @param column_order Optional, device array the same length as a row that indicates the desired + * ascending/descending order of each column in a row. If `nullopt`, it is assumed all columns are + * sorted in ascending order. + * @param null_precedence Optional, device array the same length as a row and indicates how null + * values compare to all other for every column. If `nullopt`, then null precedence would be + * `null_order::BEFORE` for all columns. + * @param comparator Physical element relational comparison functor. + */ + template + __device__ device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + std::optional> column_order = std::nullopt, + std::optional> null_precedence = std::nullopt, + PhysicalElementComparator comparator = {}) noexcept + : _lhs{lhs}, + _rhs{rhs}, + _l_dremel{}, + _r_dremel{}, + _check_nulls{check_nulls}, + _depth{}, + _column_order{column_order}, + _null_precedence{null_precedence}, + _comparator{comparator} + { + } + /** * @brief Performs a relational comparison between two elements in two columns. */ @@ -323,6 +365,8 @@ class device_row_comparator { * @param depth The depth of the column if part of a nested column @see * preprocessed_table::depths * @param comparator Physical element relational comparison functor. + * @param l_dremel_device_view <> + * @param r_dremel_device_view <> */ __device__ element_comparator(Nullate check_nulls, column_device_view lhs, @@ -370,6 +414,13 @@ class device_row_comparator { std::numeric_limits::max()); } + /** + * @brief Throws run-time error when columns types cannot be compared + * or if this class is instantiated with `has_nested_columns = false` but + * passed tables with nested columns + * + * @return Ordering + */ template () and (not has_nested_columns or not cudf::is_nested()))> @@ -379,6 +430,14 @@ class device_row_comparator { CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types."); } + /** + * @brief Compares two struct-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. + */ template )> __device__ cuda::std::pair operator()( @@ -413,6 +472,14 @@ class device_row_comparator { rhs_element_index); } + /** + * @brief Compares two list-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. 
+ */ template )> __device__ cuda::std::pair operator()(size_type lhs_element_index, diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index b90b2dac012..5d9c930d137 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -336,6 +336,23 @@ inline bool has_nested_nulls(table_view const& input) }); } +/** + * @brief Returns True if the table has a nullable column at any level of the column hierarchy + * + * @param input The table to check for nullable columns + * @return True if the table has nullable columns at any level of the column hierarchy, false + * otherwise + */ +inline bool has_nested_nullable_columns(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.nullable() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nullable_columns(table_view{{child_col}}); + }); + }); +} + /** * @brief The function to collect all nullable columns at all nested levels in a given table. * diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 1130c41afe5..64e802d88dd 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -65,6 +66,7 @@ enum class unary_operator : int32_t { * * @param input A `column_view` as input * @param op operation to perform + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the operation @@ -72,6 +74,7 @@ enum class unary_operator : int32_t { std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,6 +82,7 @@ std::unique_ptr unary_operation( * indicates the value is null and `false` indicates the value is valid. * * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` @@ -86,6 +90,7 @@ std::unique_ptr unary_operation( */ std::unique_ptr is_null( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -93,6 +98,7 @@ std::unique_ptr is_null( * indicates the value is valid and `false` indicates the value is null. 
* * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` @@ -100,6 +106,7 @@ std::unique_ptr is_null( */ std::unique_ptr is_valid( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +116,7 @@ std::unique_ptr is_valid( * * @param input Input column * @param out_type Desired datatype of output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the cast operation @@ -117,6 +125,7 @@ std::unique_ptr is_valid( std::unique_ptr cast( column_view const& input, data_type out_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -127,12 +136,14 @@ std::unique_ptr cast( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` representing `NAN` values */ std::unique_ptr is_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,6 +154,7 @@ std::unique_ptr is_nan( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` representing `NAN` @@ -150,6 +162,7 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 51f5d9d571a..2dda0740b96 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -279,6 +279,30 @@ constexpr inline bool is_integral() */ bool is_integral(data_type type); +/** + * @brief Indicates whether the type `T` is an integral type but not bool type. + * + * @tparam T The type to verify + * @return true `T` is integral but not bool + * @return false `T` is not integral or is bool + */ +template +constexpr inline bool is_integral_not_bool() +{ + return cuda::std::is_integral_v and not std::is_same_v; +} + +/** + * @brief Indicates whether `type` is a integral `data_type` and not BOOL8 + * + * "Integral" types are fundamental integer types such as `INT*` and `UINT*`. 
+ * + * @param type The `data_type` to verify + * @return true `type` is integral but not bool + * @return false `type` is integral or is bool + */ +bool is_integral_not_bool(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 059bd10eae1..f6872fcdd6d 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -140,39 +140,6 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt */ void expect_column_empty(cudf::column_view const& col); -/** - * @brief Formats a column view as a string - * - * @param col The column view - * @param delimiter The delimiter to put between strings - */ -std::string to_string(cudf::column_view const& col, std::string const& delimiter); - -/** - * @brief Formats a null mask as a string - * - * @param null_mask The null mask buffer - * @param null_mask_size Size of the null mask (in rows) - */ -std::string to_string(std::vector const& null_mask, size_type null_mask_size); - -/** - * @brief Convert column values to a host vector of strings - * - * @param col The column view - */ -std::vector to_strings(cudf::column_view const& col); - -/** - * @brief Print a column view to an ostream - * - * @param os The output stream - * @param col The column view - */ -void print(cudf::column_view const& col, - std::ostream& os = std::cout, - std::string const& delimiter = ","); - /** * @brief Copy the null bitmask from a column view to a host vector * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..b9f2e0d9868 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -802,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper { offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count); + wrapped = cudf::make_strings_column( + d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); } /** @@ -1281,6 +1283,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. 
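Referring back to the is_integral_not_bool trait introduced in traits.hpp above, a compile-time sketch of its intended behavior (not part of the patch):

#include <cudf/types.hpp>
#include <cudf/utilities/traits.hpp>

static_assert(cudf::is_integral_not_bool<int32_t>(), "plain integer types qualify");
static_assert(not cudf::is_integral_not_bool<bool>(), "bool is explicitly excluded");
static_assert(not cudf::is_integral_not_bool<float>(), "floating-point types are not integral");

// The runtime overload mirrors the trait for data_type values, e.g.
//   cudf::is_integral_not_bool(cudf::data_type{cudf::type_id::INT64})  -> true
//   cudf::is_integral_not_bool(cudf::data_type{cudf::type_id::BOOL8})  -> false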
@@ -1542,8 +1549,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1629,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1662,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1716,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( @@ -1825,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity); } @@ -1861,7 +1884,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity_iter); } @@ -1885,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper { return cudf::test::detail::make_null_mask(validity.begin(), validity.end()); }(); - wrapped = cudf::make_structs_column( - num_rows, std::move(child_columns), null_count, std::move(null_mask)); + wrapped = cudf::make_structs_column(num_rows, + std::move(child_columns), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } template diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp new file mode 100644 index 00000000000..a0881490b82 --- /dev/null +++ b/cpp/include/cudf_test/debug_utilities.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf::test { + +/** + * @brief Formats a column view as a string + * + * @param col The input column view + * @param delimiter The delimiter to put between strings + */ +std::string to_string(cudf::column_view const& col, std::string const& delimiter); + +/** + * @brief Convert column values to a host vector of strings + * + * @param col The input column view + */ +std::vector to_strings(cudf::column_view const& col); + +/** + * @brief Print a column view to an ostream + * + * @param col The input column view + * @param os The output stream + */ +void print(cudf::column_view const& col, std::ostream& os = std::cout); + +} // namespace cudf::test diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp deleted file mode 100644 index f8270f61f10..00000000000 --- a/cpp/include/cudf_test/detail/column_utilities.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace cudf { -namespace test { -namespace detail { - -/** - * @brief Formats a column view as a string - * - * @param col The column view - * @param delimiter The delimiter to put between strings - * @param indent Indentation for all output. See detail::to_strings for detailed - * explanation. - */ -std::string to_string(cudf::column_view const& col, - std::string const& delimiter, - std::string const& indent = ""); - -/** - * @brief Formats a null mask as a string - * - * @param null_mask The null mask buffer - * @param null_mask_size Size of the null mask (in rows) - * @param indent Indentation for all output. See detail::to_strings for detailed - * explanation. - */ -std::string to_string(std::vector const& null_mask, - size_type null_mask_size, - std::string const& indent = ""); - -/** - * @brief Convert column values to a host vector of strings - * - * Supports indentation of all output. For example, if the displayed output of your column - * would be - * - * @code{.pseudo} - * "1,2,3,4,5" - * @endcode - * and the `indent` parameter was " ", that indentation would be prepended to - * result in the output - * @code{.pseudo} - * " 1,2,3,4,5" - * @endcode - * - * The can be useful for displaying complex types. An example use case would be for - * displaying the nesting of a LIST type column (via recursion). 
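The to_string/to_strings/print helpers removed from column_utilities.hpp earlier in this diff now live in the new cudf_test/debug_utilities.hpp header shown above. A small usage sketch; the test wrapper type is assumed from the existing test utilities:

#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/debug_utilities.hpp>

#include <iostream>
#include <string>

void dump_column()
{
  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 2, 3, 4, 5});
  // Render the column as a single delimited string ...
  std::string repr = cudf::test::to_string(col, ",");
  // ... or write it directly to an output stream.
  cudf::test::print(col, std::cout);
}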
- * - * List>: - * Length : 3 - * Offsets : 0, 2, 5, 6 - * Children : - * List: - * Length : 6 - * Offsets : 0, 2, 4, 7, 8, 9, 11 - * Children : - * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 - * - * @param col The column view - * @param indent Indentation for all output - */ -std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); - -} // namespace detail -} // namespace test -} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 4da2807bbe6..8845b84613d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -130,7 +130,6 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_extract Extracting - * @defgroup strings_json JSON * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary @@ -146,6 +145,10 @@ * @defgroup io_datasources Data Sources * @defgroup io_datasinks Data Sinks * @} + * @defgroup json_apis JSON + * @{ + * @defgroup json_object JSON Path + * @} * @defgroup lists_apis Lists * @{ * @defgroup lists_combine Combining diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp similarity index 72% rename from cpp/include/nvtext/bpe_tokenize.hpp rename to cpp/include/nvtext/byte_pair_encoding.hpp index c67f4bd8b1c..632a3cc279f 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -32,11 +32,11 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call @ref nvtext::load_merge_pairs_file + * To create an instance, call @ref nvtext::load_merge_pairs */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; - std::unique_ptr impl{}; ///< Implementation of the BPE merge pairs table. + bpe_merge_pairs_impl* impl{}; ///< Implementation of the BPE merge pairs table. /** * @brief Construct a new bpe merge pairs object @@ -61,11 +61,14 @@ struct bpe_merge_pairs { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); + bpe_merge_pairs(); }; /** * @brief Create a nvtext::bpe_merge_pairs from an input file. * + * @deprecated Since 23.12 + * * The file should contain a pair of strings per line separated by * a single space. * @@ -94,10 +97,40 @@ struct bpe_merge_pairs { * @param mr Memory resource to allocate any returned objects. * @return A nvtext::bpe_merge_pairs object */ -std::unique_ptr load_merge_pairs_file( +[[deprecated]] std::unique_ptr load_merge_pairs_file( std::string const& filename_merges, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a nvtext::bpe_merge_pairs from a strings column + * + * The input column should contain a unique pair of strings per line separated by + * a single space. An incorrect format or non-unique entries will result in + * undefined behavior. + * + * Example: + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) + * // the mps object can be passed to the byte_pair_encoding API + * @endcode + * + * The pairs are expected to be ordered in the file by their rank + * relative to each other. A pair earlier in the file has priority over + * any pairs below it. 
+ * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * + * @param merge_pairs Column containing the unique merge pairs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return A nvtext::bpe_merge_pairs object + */ +std::unique_ptr load_merge_pairs( + cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Byte pair encode the input strings. * @@ -110,7 +143,8 @@ std::unique_ptr load_merge_pairs_file( * pairs before the result is joined to make the output string. * * @code{.pseudo} - * mps = load_merges_file("merges.txt") // see doxygen for example contents + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] @@ -120,7 +154,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 1be25b4f1f4..3cbff5c744b 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,12 +44,14 @@ namespace nvtext { * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @param strings Strings column to normalize. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param input Strings column to normalize + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings columns of normalized strings. */ std::unique_ptr normalize_spaces( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -89,16 +91,18 @@ std::unique_ptr normalize_spaces( * This function requires about 16x the number of character bytes in the input * strings column as working memory. * - * @param strings The input strings to normalize. + * @param input The input strings to normalize * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param mr Memory resource to allocate any returned objects. 
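Taken together with the deprecation of load_merge_pairs_file, the new column-based nvtext::load_merge_pairs above changes the typical byte-pair-encoding call sequence. A hedged sketch of the new flow, using a test wrapper to build the merge-pair column (wrapper and include names assumed):

#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void bpe_example()
{
  // Merge pairs ordered by rank; earlier pairs take priority.
  cudf::test::strings_column_wrapper merge_pairs(
    {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
  auto mps = nvtext::load_merge_pairs(cudf::strings_column_view{merge_pairs});

  cudf::test::strings_column_wrapper input({"test sentence", "thisis test"});
  // Per the documentation above, the result is ["test sent ence", "this is test"].
  auto result = nvtext::byte_pair_encoding(cudf::strings_column_view{input}, *mps);
}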
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects * @return Normalized strings column */ std::unique_ptr normalize_characters( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index 0dde7f195b9..88cf7d41901 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,19 +73,21 @@ namespace nvtext { * @throw cudf::logic_error if targets or replacements contain nulls * @throw cudf::logic_error if delimiter is invalid * - * @param strings Strings column to replace. - * @param targets Strings to compare against tokens found in `strings` + * @param input Strings column to replace + * @param targets Strings to compare against tokens found in `input` * @param replacements Replacement strings for each string in `targets` * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr replace_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -120,19 +122,21 @@ std::unique_ptr replace_tokens( * * @throw cudf::logic_error if `delimiter` or `replacement` is invalid * - * @param strings Strings column to replace. - * @param min_token_length The minimum number of characters to retain a token in the output string. - * @param replacement Optional replacement string to be used in place of removed tokens. + * @param input Strings column to replace + * @param min_token_length The minimum number of characters to retain a token in the output string + * @param replacement Optional replacement string to be used in place of removed tokens * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. 
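replace_tokens (and filter_tokens just below) now take an explicit CUDA stream ahead of the memory resource. A sketch of passing a user-owned stream; the wrapper type is assumed from the test utilities:

#include <nvtext/replace.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream.hpp>

void replace_tokens_on_stream()
{
  cudf::test::strings_column_wrapper input({"this is me", "the bird flew"});
  cudf::test::strings_column_wrapper targets({"is", "the"});
  cudf::test::strings_column_wrapper repls({"+", "-"});

  rmm::cuda_stream stream;  // user-owned, non-default stream
  auto result = nvtext::replace_tokens(cudf::strings_column_view{input},
                                       cudf::strings_column_view{targets},
                                       cudf::strings_column_view{repls},
                                       cudf::string_scalar{"", true, stream.view()},  // empty = whitespace delimiter
                                       stream.view());
}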
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr filter_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement = cudf::string_scalar{""}, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 44f8f44557c..107fefcc3bf 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -49,15 +49,17 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column tokenize. + * @param input Strings column to tokenize * @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,14 +86,16 @@ std::unique_ptr tokenize( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls. * - * @param strings Strings column to tokenize. - * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize + * @param delimiters Strings used to separate individual strings into tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,15 +116,17 @@ std::unique_ptr tokenize( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @param strings Strings column to use for this operation - * @param delimiter Strings used to separate each string into tokens; + * @param input Strings column to count tokens + * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. 
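nvtext::tokenize follows the same pattern, with the stream parameter inserted between the delimiter and the memory resource. A minimal sketch:

#include <nvtext/tokenize.hpp>

#include <cudf/utilities/default_stream.hpp>
#include <cudf_test/column_wrapper.hpp>

void tokenize_on_stream()
{
  cudf::test::strings_column_wrapper input({"the fox jumped", "over the dog"});

  // Default whitespace delimiter, explicit stream, default memory resource.
  auto tokens = nvtext::tokenize(cudf::strings_column_view{input},
                                 cudf::string_scalar{""},
                                 cudf::get_default_stream());
}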
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,14 +149,16 @@ std::unique_ptr count_tokens( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls * - * @param strings Strings column to use for this operation + * @param input Strings column to count tokens * @param delimiters Strings used to separate each string into tokens + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -168,12 +176,14 @@ std::unique_ptr count_tokens( * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr character_tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -203,16 +213,18 @@ std::unique_ptr character_tokenize( * @throw cudf::logic_error if `row_indices.size() != strings.size()` * @throw cudf::logic_error if `row_indices` contains nulls * - * @param strings Strings column to detokenize. - * @param row_indices The relative output row index assigned for each token in the input column. - * @param separator String to append after concatenating each token to the proper output row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
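count_tokens and character_tokenize are updated the same way. A short sketch of both, again using a test wrapper for the input column:

#include <nvtext/tokenize.hpp>

#include <cudf/utilities/default_stream.hpp>
#include <cudf_test/column_wrapper.hpp>

void token_counts()
{
  cudf::test::strings_column_wrapper input({"the fox jumped", "over the dog"});
  auto view = cudf::strings_column_view{input};

  // Number of whitespace-separated tokens per row: [3, 3]
  auto counts = nvtext::count_tokens(view, cudf::string_scalar{""}, cudf::get_default_stream());

  // One output row per character of every valid input row.
  auto chars = nvtext::character_tokenize(view, cudf::get_default_stream());
}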
+ * @param input Strings column to detokenize + * @param row_indices The relative output row index assigned for each token in the input column + * @param separator String to append after concatenating each token to the proper output row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr detokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 33bd04fffb3..4128afa3935 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -21,8 +21,8 @@ include(rapids-export) include(rapids-find) project( - CUDA_KAFKA - VERSION 23.10.00 + CUDF_KAFKA + VERSION 23.12.00 LANGUAGES CXX ) @@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp) # ################################################################################################## # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudf_kafka PUBLIC "$" + cudf_kafka PUBLIC "$" "$" ) @@ -85,6 +85,8 @@ set_target_properties( CXX_STANDARD_REQUIRED ON ) +add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka) + # ################################################################################################## # * cudf_kafka Install ---------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -94,7 +96,7 @@ install( EXPORT cudf_kafka-exports ) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) rapids_export( INSTALL cudf_kafka diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index aa4c5b60e7a..20aa9873f43 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION) endif() endfunction() -set(CUDA_KAFKA_MIN_VERSION_cudf - "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}" +set(CUDF_KAFKA_MIN_VERSION + "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}" ) -find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) +find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION}) if(cudf_REQUIRES_CUDA) - rapids_cuda_init_architectures(CUDA_KAFKA) + rapids_cuda_init_architectures(CUDF_KAFKA) # Since we are building cudf as part of ourselves we need to enable the CUDA language in the # top-most scope enable_language(CUDA) - # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that + # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` calling - if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) - include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}") endif() endif() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 68a5327b455..b819cb6fc3b 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -26,7 +26,7 @@ function(ConfigureTest test_name) add_executable(${test_name} ${ARGN}) set_target_properties( ${test_name} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); @@ -405,38 +405,42 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, 
data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1f7f342632a..85ab5c6d6cb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -47,14 +47,16 @@ namespace { struct scalar_as_column_view { using return_type = typename std::pair>; template ())> - return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = column_view(s.type(), 1, h_scalar_type_view.data(), reinterpret_cast(s.validity_data()), - !s.is_valid()); + !s.is_valid(stream)); return std::pair{col_v, std::unique_ptr(nullptr)}; } template ())> diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..1a1cbb17d15 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -505,39 +510,67 @@ std::pair bitmask_or(table_view const& view, return std::pair(std::move(null_mask), 0); } +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.nullable()) { + auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr); + output.set_null_mask(std::move(mask), 0); + + for (size_type i = 0; i < input.num_children(); ++i) { + set_all_valid_null_masks(input.child(i), output.child(i), stream, mr); + } + } +} + } // namespace detail // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); 
+ CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a9edcfecbf7..a456d4b5964 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -49,6 +49,7 @@ struct calculate_quantile_fn { double const* d_quantiles; size_type num_quantiles; interpolation interpolation; + size_type* null_count; __device__ void operator()(size_type i) { @@ -68,11 +69,13 @@ struct calculate_quantile_fn { thrust::for_each_n(thrust::seq, thrust::make_counting_iterator(0), num_quantiles, - [d_result = d_result, segment_size, offset](size_type j) { - if (segment_size == 0) + [d_result = d_result, segment_size, offset, this](size_type j) { + if (segment_size == 0) { d_result.set_null(offset + j); - else + atomicAdd(this->null_count, 1); + } else { d_result.set_valid(offset + j); + } }); } }; @@ -104,6 +107,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto null_count = rmm::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { @@ -118,7 +122,8 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } else { auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_view); thrust::for_each_n(rmm::exec_policy(stream), @@ -131,9 +136,11 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } + result->set_null_count(null_count.value(stream)); return result; } diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh deleted file mode 100644 index 183042fc0f4..00000000000 --- a/cpp/src/hash/unordered_multiset.cuh +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { -/* - * Device view of the unordered multiset - */ -template , - typename Equality = equal_to> -class unordered_multiset_device_view { - public: - unordered_multiset_device_view(size_type hash_size, - size_type const* hash_begin, - Element const* hash_data) - : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals() - { - } - - bool __device__ contains(Element e) const - { - size_type loc = hasher(e) % (2 * hash_size); - - for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) { - if (equals(hash_data[i], e)) return true; - } - - return false; - } - - private: - Hasher hasher; - Equality equals; - size_type hash_size; - size_type const* hash_begin; - Element const* hash_data; -}; - -/* - * Fixed size set on a device. - */ -template , - typename Equality = equal_to> -class unordered_multiset { - public: - /** - * @brief Factory to construct a new unordered_multiset - */ - static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) - { - auto d_column = column_device_view::create(col, stream); - auto d_col = *d_column; - - auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_data = rmm::device_uvector(d_col.size(), stream); - - Hasher hasher; - size_type* d_hash_bins_start = hash_bins_start.data(); - size_type* d_hash_bins_end = hash_bins_end.data(); - Element* d_hash_data = hash_data.data(); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; - ref.fetch_add(1, cuda::std::memory_order_relaxed); - } - }); - - thrust::exclusive_scan(rmm::exec_policy(stream), - hash_bins_start.begin(), - hash_bins_start.end(), - hash_bins_end.begin()); - - thrust::copy(rmm::exec_policy(stream), - hash_bins_end.begin(), - hash_bins_end.end(), - hash_bins_start.begin()); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; - size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); - d_hash_data[offset] = e; - } - }); - - return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); - } - - unordered_multiset_device_view to_device() const - { - return unordered_multiset_device_view( - size, hash_bins.data(), hash_data.data()); - } - - private: - unordered_multiset(size_type size, - rmm::device_uvector&& hash_bins, - rmm::device_uvector&& hash_data) - : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} - { - } - - size_type size; - rmm::device_uvector hash_bins; - rmm::device_uvector hash_data; -}; - -} // namespace detail -} // 
namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 0cd750bc947..3a9fe50d25b 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -197,7 +197,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); + using DeviceType = int32_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -208,7 +210,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); + using DeviceType = int64_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -219,7 +223,8 @@ std::shared_ptr dispatch_to_arrow::operator() arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - using DeviceType = __int128_t; + using DeviceType = __int128_t; + auto const max_precision = cudf::detail::max_precision(); rmm::device_uvector buf(input.size(), stream); @@ -234,7 +239,7 @@ std::shared_ptr dispatch_to_arrow::operator() CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(max_precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -377,10 +382,10 @@ std::shared_ptr dispatch_to_arrow::operator()( { // Arrow dictionary requires indices to be signed integer std::unique_ptr dict_indices = - cast(cudf::dictionary_column_view(input).get_indices_annotated(), - cudf::data_type{type_id::INT32}, - stream, - rmm::mr::get_current_device_resource()); + detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(), + cudf::data_type{type_id::INT32}, + stream, + rmm::mr::get_current_device_resource()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 8c586306ad5..6e9c634804c 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -146,6 +146,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // Note: `null` replacement with `na_rep` deferred to `concatenate()` // instead of column-wise; might be faster // @@ -160,8 +166,9 @@ struct column_to_strings_fn { std::enable_if_t, std::unique_ptr> operator()( column_view const& column) const { - return cudf::strings::detail::from_booleans( - column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + string_scalar true_string{options_.get_true_value(), true, stream_}; + string_scalar false_string{options_.get_false_value(), true, stream_}; + return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_); } // strings: @@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink, 
CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator(), true, stream}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, - string_scalar("", false), + string_scalar{"", false, stream}, stream, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink, // populate vector of string-converted columns: // - std::transform(sub_view.begin(), - sub_view.end(), - std::back_inserter(str_column_vec), - [converter](auto const& current_col) { - return cudf::type_dispatcher(current_col.type(), converter, current_col); - }); + std::transform( + sub_view.begin(), + sub_view.end(), + std::back_inserter(str_column_vec), + [&converter = std::as_const(converter)](auto const& current_col) { + return cudf::type_dispatcher( + current_col.type(), converter, current_col); + }); // create string table view from str_column_vec: // @@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { + cudf::string_scalar delimiter_str{ + std::string{options.get_inter_column_delimiter()}, true, stream}; + cudf::string_scalar options_narep{options.get_na_rep(), true, stream}; if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options.get_na_rep(), + options_narep, strings::separator_on_nulls::YES, stream, rmm::mr::get_current_device_resource()); - cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls( - str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); + str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index c4f99736306..22385d33c7b 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,14 @@ enum class stack_op_type : int8_t { RESET = 3 ///< Operation popping all items currently on the stack }; +/** + * @brief Describes the kind of stack operations supported by the logical stack. + */ +enum class stack_op_support : bool { + NO_RESET_SUPPORT = false, ///< A stack that only supports push(x) and pop() operations + WITH_RESET_SUPPORT = true ///< A stack that supports push(x), pop(), and reset() operations +}; + namespace detail { /** @@ -130,6 +139,37 @@ struct StackSymbolToStackOp { StackSymbolToStackOpTypeT symbol_to_stack_op_type; }; +/** + * @brief Function object that maps a stack `reset` operation to `1`. + */ +template +struct NewlineToResetStackSegmentOp { + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StackSymbolT const& stack_symbol) const + { + stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); + + // Every reset operation marks the beginning of a new segment + return (stack_op == stack_op_type::RESET) ? 
1 : 0; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Function object that wraps around for values that exceed the largest value of `TargetT` + */ +template +struct ModToTargetTypeOpT { + template + constexpr CUDF_HOST_DEVICE TargetT operator()(T const& val) const + { + return static_cast( + val % (static_cast(cuda::std::numeric_limits::max()) + static_cast(1))); + } +}; + /** * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). @@ -140,9 +180,7 @@ struct AddStackLevelFromStackOp { constexpr CUDF_HOST_DEVICE StackOp operator()( StackOp const& lhs, StackOp const& rhs) const { - StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET) - ? 0 - : (lhs.stack_level + rhs.stack_level); + StackLevelT new_level = lhs.stack_level + rhs.stack_level; return StackOp{new_level, rhs.value}; } @@ -230,6 +268,8 @@ struct RemapEmptyStack { * onto the stack or pop something from the stack and resolves the symbol that is on top of the * stack. * + * @tparam SupportResetOperation Whether the logical stack also supports `reset` operations that + * reset the stack to the empty stack * @tparam StackLevelT Signed integer type that must be sufficient to cover [-max_stack_level, * max_stack_level] for the given sequence of stack operations. Must be signed as it needs to cover * the stack level of any arbitrary subsequence of stack operations. @@ -261,7 +301,8 @@ struct RemapEmptyStack { * what-is-on-top-of-the-stack * @param[in] stream The cuda stream to which to dispatch the work */ -template ; + // Type used to mark *-by-key segments after `reset` operations + using StackSegmentT = uint8_t; + // The unsigned integer type that we use for radix sorting items of type StackOpT using StackOpUnsignedT = detail::UnsignedStackOpType; static_assert(!std::is_void(), "unsupported StackOpT size"); @@ -292,6 +336,8 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, using TransformInputItT = cub::TransformInputIterator; + constexpr bool supports_reset_op = SupportResetOperation == stack_op_support::WITH_RESET_SUPPORT; + auto const num_symbols_in = d_symbol_positions.size(); // Converting a stack symbol that may either push or pop to a stack operation: @@ -330,14 +376,44 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + auto const fake_key_segment_it = static_cast(nullptr); + std::size_t gen_segments_scan_bytes = 0; + std::size_t scan_by_key_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + nullptr, + gen_segments_scan_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(fake_key_segment_it, + detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + nullptr, + scan_by_key_bytes, + fake_key_segment_it, + 
stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) @@ -401,14 +477,41 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + rmm::device_uvector key_segments{num_symbols_in, stream}; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + temp_storage.data(), + total_temp_storage_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(key_segments.data(), + detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + temp_storage.data(), + total_temp_storage_bytes, + key_segments.data(), + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 37c99453361..42036b79751 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -753,7 +753,7 @@ class TranslationOp { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + return translation_op(state_id, match_id, relative_offset, read_symbol); } template @@ -761,7 +761,7 @@ class TranslationOp { SymbolIndexT const match_id, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, read_symbol); + return translation_op(state_id, match_id, read_symbol); } }; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..964e40e36cd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -41,9 +41,9 @@ namespace cudf { namespace io { // Returns builder for csv_reader_options -csv_reader_options_builder csv_reader_options::builder(source_info const& src) +csv_reader_options_builder csv_reader_options::builder(source_info src) { - return csv_reader_options_builder{src}; + return csv_reader_options_builder{std::move(src)}; } // Returns builder for csv_writer_options @@ -54,9 +54,9 @@ 
csv_writer_options_builder csv_writer_options::builder(sink_info const& sink, } // Returns builder for orc_reader_options -orc_reader_options_builder orc_reader_options::builder(source_info const& src) +orc_reader_options_builder orc_reader_options::builder(source_info src) { - return orc_reader_options_builder{src}; + return orc_reader_options_builder{std::move(src)}; } // Returns builder for orc_writer_options @@ -73,15 +73,15 @@ chunked_orc_writer_options_builder chunked_orc_writer_options::builder(sink_info } // Returns builder for avro_reader_options -avro_reader_options_builder avro_reader_options::builder(source_info const& src) +avro_reader_options_builder avro_reader_options::builder(source_info src) { - return avro_reader_options_builder(src); + return avro_reader_options_builder(std::move(src)); } // Returns builder for json_reader_options -json_reader_options_builder json_reader_options::builder(source_info const& src) +json_reader_options_builder json_reader_options::builder(source_info src) { - return json_reader_options_builder(src); + return json_reader_options_builder(std::move(src)); } // Returns builder for orc_writer_options @@ -92,9 +92,9 @@ json_writer_options_builder json_writer_options::builder(sink_info const& sink, } // Returns builder for parquet_reader_options -parquet_reader_options_builder parquet_reader_options::builder(source_info const& src) +parquet_reader_options_builder parquet_reader_options::builder(source_info src) { - return parquet_reader_options_builder{src}; + return parquet_reader_options_builder{std::move(src)}; } // Returns builder for parquet_writer_options @@ -200,7 +200,9 @@ compression_type infer_compression_type(compression_type compression, source_inf return compression_type::NONE; } -table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -210,10 +212,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return json::detail::read_json(datasources, options, cudf::get_default_stream(), mr); + return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -222,11 +226,13 @@ void write_json(json_writer_options const& options, rmm::mr::device_memory_resou sinks[0].get(), options.get_table(), options, - cudf::get_default_stream(), + stream, mr); } -table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -241,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::get_default_stream(), + stream, mr); } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_csv(csv_writer_options const& 
options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf::io::detail; @@ -258,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc options.get_table(), options.get_names(), options, - cudf::get_default_stream(), + stream, mr); } @@ -470,8 +478,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3bbfc4b5f83..8d89f4ff927 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 06ac11485cb..496e5b25e60 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -91,6 +91,98 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to help fixing the stack context of characters that follow the first record on each JSON line +namespace fix_stack_of_excess_chars { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states. + */ +enum class dfa_states : StateT { + // Before the first record on the JSON line + BEFORE, + // Within the first record on the JSON line + WITHIN, + // Excess data that follows the first record on the JSON line + EXCESS, + // Total number of states + NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ROOT, ///< Symbol for root stack context + DELIMITER, ///< Line delimiter symbol group + OTHER, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +/** + * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group. + */ +struct SymbolPairToSymbolGroupId { + CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple symbol) const + { + auto const input_symbol = thrust::get<0>(symbol); + auto const stack_symbol = thrust::get<1>(symbol); + return static_cast( + input_symbol == '\n' + ? dfa_symbol_group_id::DELIMITER + : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER)); + } +}; + +/** + * @brief Translation function object that fixes the stack context of excess data that follows after + * the first JSON record on each line. 
+ */ +struct TransduceInputOp { + template + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + if (state_id == static_cast(dfa_states::EXCESS)) { return '_'; } + return thrust::get<1>(read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + constexpr int32_t single_output_item = 1; + return single_output_item; + } +}; + +// Aliases for readability of the transition table +constexpr auto TT_BEFORE = dfa_states::BEFORE; +constexpr auto TT_INSIDE = dfa_states::WITHIN; +constexpr auto TT_EXCESS = dfa_states::EXCESS; + +// Transition table +std::array, TT_NUM_STATES> constexpr transition_table{ + {/* IN_STATE ROOT NEWLINE OTHER */ + /* TT_BEFORE */ {{TT_BEFORE, TT_BEFORE, TT_INSIDE}}, + /* TT_INSIDE */ {{TT_EXCESS, TT_BEFORE, TT_INSIDE}}, + /* TT_EXCESS */ {{TT_EXCESS, TT_BEFORE, TT_EXCESS}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(dfa_states::BEFORE); +} // namespace fix_stack_of_excess_chars + // FST to prune tokens of invalid lines for recovering JSON lines format namespace token_filter { @@ -146,9 +238,8 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. */ struct TransduceToken { - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const @@ -165,9 +256,8 @@ struct TransduceToken { } } - template - constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, SymbolGroupT const match_id, SymbolT const read_symbol) const { @@ -253,27 +343,35 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NU std::array const symbol_groups{ {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}}; -// Transition table +// Transition table for the default JSON and JSON lines formats std::array, TT_NUM_STATES> const transition_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}}, /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; -// Translation table (i.e., for each transition, what are the symbols that we output) +// Transition table for the JSON lines format that recovers from invalid JSON lines +std::array, TT_NUM_STATES> const + resetting_transition_table{ + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_OOS, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR}}}}; + +// Translation table for the default JSON and JSON lines formats std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}}, /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; -// Translation table +// Translation table for the JSON lines format that recovers from invalid JSON lines 
std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const resetting_translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}}, - /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, - /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -643,6 +741,11 @@ auto get_transition_table(json_format_cfg_t format) // PD_ANL describes the target state after a new line after encountering error state auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + // Target state after having parsed the first JSON value on a JSON line + // Spark has the special need to ignore everything that comes after the first JSON object + // on a JSON line instead of marking those as invalid + auto const PD_AFS = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_PVL : PD_ERR; + // First row: empty stack ("root" level of the JSON) // Second row: '[' on top of stack (we're parsing a list value) // Third row: '{' on top of stack (we're parsing a struct value) @@ -660,15 +763,15 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_PVL, PD_BOV, PD_AFS, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { @@ -680,9 +783,9 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, 
PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, @@ -697,8 +800,11 @@ auto get_transition_table(json_format_cfg_t format) /** * @brief Getting the translation table + * @param recover_from_error Whether or not the tokenizer should recover from invalid lines. If + * `recover_from_error` is true, invalid JSON lines end with the token sequence (`ErrorBegin`, + * `LineEnd`) and incomplete JSON lines (e.g., `{"a":123\n`) are treated as invalid lines. */ -auto get_translation_table(bool include_line_delimiter) +auto get_translation_table(bool recover_from_error) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -715,76 +821,95 @@ auto get_translation_table(bool include_line_delimiter) constexpr auto ErrorBegin = token_t::ErrorBegin; /** - * @brief Appends token_t::LineEnd token to the given token sequence, if and only if - * `include_line_delimiter` is true. + * @brief Instead of specifying the verbose translation tables twice (i.e., once when + * `recover_from_error` is true and once when it is false), we use `nl_tokens` to specialize the + * translation table where it differs depending on the `recover_from_error` option. If and only if + * `recover_from_error` is true, `recovering_tokens` are returned along with a token_t::LineEnd + * token, otherwise `regular_tokens` is returned. + */ + auto nl_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { + recovering_tokens.push_back(token_t::LineEnd); + return recovering_tokens; + } + return regular_tokens; + }; + + /** + * @brief Helper function that returns `recovering_tokens` if `recover_from_error` is true and + * returns `regular_tokens` otherwise. This is used to ignore excess characters after the first + * value in the case of JSON lines that recover from invalid lines, as Spark ignores any excess + * characters that follow the first record on a JSON line.
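 *
 * As a small worked example (token values chosen for illustration): nl_tokens({ValueEnd},
 * {ErrorBegin}) yields {ValueEnd} when `recover_from_error` is false and {ErrorBegin, LineEnd}
 * when it is true, while alt_tokens({ErrorBegin}, {}) yields {ErrorBegin} in the default mode and
 * an empty token sequence in the recovering mode, which is how characters that follow the first
 * value on a line end up emitting no tokens at all.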
*/ - auto nl_tokens = [include_line_delimiter](std::vector tokens) { - if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } - return tokens; + auto alt_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { return recovering_tokens; } + return regular_tokens; }; std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET + {ListEnd}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*LIST*/ + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*STRUCT*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}}}; // OTHER + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -795,33 +920,33 @@ auto get_translation_table(bool include_line_delimiter) 
{ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -832,108 +957,108 @@ auto get_translation_table(bool include_line_delimiter) {ValueEnd, StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + 
{}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACE + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACE + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // QUOTE + {alt_tokens({ErrorBegin}, {})}, // ESCAPE + {alt_tokens({ErrorBegin}, {})}, // COMMA + {alt_tokens({ErrorBegin}, {})}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {alt_tokens({ErrorBegin}, {})}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -944,34 +1069,34 @@ auto get_translation_table(bool include_line_delimiter) {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // 
COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -982,156 +1107,159 @@ auto get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // 
OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_FLN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + 
{ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } @@ -1295,14 +1423,19 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - // Translation table specialized on the choice of whether to reset on newlines outside of strings + // Transition table specialized on the choice of whether to reset on newlines + const auto transition_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) + ? to_stack_op::resetting_transition_table + : to_stack_op::transition_table; + + // Translation table specialized on the choice of whether to reset on newlines const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) ? 
to_stack_op::resetting_translation_table : to_stack_op::translation_table; auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), - fst::detail::make_transition_table(to_stack_op::transition_table), + fst::detail::make_transition_table(transition_table), fst::detail::make_translation_table(translation_table), stream); @@ -1321,7 +1454,7 @@ void get_stack_context(device_span json_in, // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONWithRecoveryToStackOp{}, @@ -1331,7 +1464,7 @@ void get_stack_context(device_span json_in, json_in.size(), stream); } else { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, @@ -1433,6 +1566,26 @@ std::pair, rmm::device_uvector> ge // character. auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); + // Spark, as the main stakeholder in the `recover_from_error` option, has the specific need to + // ignore any characters that follow the first value on each JSON line. This is an FST that + // fixes the stack context for those excess characters. That is, that all those excess characters + // will be interpreted in the root stack context + if (recover_from_error) { + auto fix_stack_of_excess_chars = fst::detail::make_fst( + fst::detail::make_symbol_group_lookup_op( + fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), + fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), + fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + stream); + fix_stack_of_excess_chars.Transduce(zip_in, + static_cast(json_in.size()), + stack_symbols.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + fix_stack_of_excess_chars::start_state, + stream); + } + constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 2d363c51fce..c211d17f13a 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -504,6 +504,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // unsupported type of column: template std::enable_if_t(), std::unique_ptr> operator()( @@ -614,17 +620,18 @@ struct column_to_strings_fn { auto child_string_with_null = [&]() { if (child_view.type().id() == type_id::STRUCT) { - return (*this).template operator()( - child_view, - children_names.size() > child_index ? children_names[child_index].children - : std::vector{}); - } else if (child_view.type().id() == type_id::LIST) { - return (*this).template operator()(child_view, + return this->template operator()(child_view, children_names.size() > child_index ? 
children_names[child_index].children : std::vector{}); + } else if (child_view.type().id() == type_id::LIST) { + return this->template operator()(child_view, + children_names.size() > child_index + ? children_names[child_index].children + : std::vector{}); } else { - return cudf::type_dispatcher(child_view.type(), *this, child_view); + return cudf::type_dispatcher( + child_view.type(), *this, child_view); } }; auto new_offsets = cudf::lists::detail::get_normalized_offsets( @@ -679,27 +686,29 @@ struct column_to_strings_fn { // auto i_col_begin = thrust::make_zip_iterator(thrust::counting_iterator(0), column_begin); - std::transform(i_col_begin, - i_col_begin + num_columns, - std::back_inserter(str_column_vec), - [this, &children_names](auto const& i_current_col) { - auto const i = thrust::get<0>(i_current_col); - auto const& current_col = thrust::get<1>(i_current_col); - // Struct needs children's column names - if (current_col.type().id() == type_id::STRUCT) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else if (current_col.type().id() == type_id::LIST) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else { - return cudf::type_dispatcher(current_col.type(), *this, current_col); - } - }); + std::transform( + i_col_begin, + i_col_begin + num_columns, + std::back_inserter(str_column_vec), + [this, &children_names](auto const& i_current_col) { + auto const i = thrust::get<0>(i_current_col); + auto const& current_col = thrust::get<1>(i_current_col); + // Struct needs children's column names + if (current_col.type().id() == type_id::STRUCT) { + return this->template operator()(current_col, + children_names.size() > i + ? children_names[i].children + : std::vector{}); + } else if (current_col.type().id() == type_id::LIST) { + return this->template operator()(current_col, + children_names.size() > i + ? 
children_names[i].children + : std::vector{}); + } else { + return cudf::type_dispatcher( + current_col.type(), *this, current_col); + } + }); // create string table view from str_column_vec: // diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index dba7a9ffda5..243704b65d4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -150,7 +150,8 @@ struct EncChunk { uint8_t dtype_len; // data type length int32_t scale; // scale for decimals or timestamps - uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_data_order; // map from data to sorted data indices uint32_t* decimal_offsets; orc_column_device_view const* column; }; @@ -191,11 +192,12 @@ struct stripe_dictionary { size_type num_rows = 0; // number of rows in the stripe // output - device_span data; // index of elements in the column to include in the dictionary - device_span index; // index into the dictionary for each row in the column - size_type entry_count = 0; // number of entries in the dictionary - size_type char_count = 0; // number of characters in the dictionary - bool is_enabled = false; // true if dictionary encoding is enabled for this stripe + device_span data; // index of elements in the column to include in the dictionary + device_span index; // index into the dictionary for each row in the column + device_span data_order; // map from data to sorted data indices + size_type entry_count = 0; // number of entries in the dictionary + size_type char_count = 0; // number of characters in the dictionary + bool is_enabled = false; // true if dictionary encoding is enabled for this stripe }; /** @@ -424,6 +426,20 @@ void 
rowgroup_char_counts(device_2dspan counts, device_span str_col_indexes, rmm::cuda_stream_view stream); +/** + * @brief Converts sizes of decimal elements to offsets within the rowgroup. + * + * @note The conversion is done in-place. After the conversion, the device vectors in \p elem_sizes + * hold the offsets. + * + * @param rg_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param elem_sizes Map between column indexes and decimal element sizes + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream); + /** * @brief Launches kernels to initialize statistics collection * diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -76,8 +80,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, 
min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded 
value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int 
decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile 
uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if (s->chunk.valid_map_base != nullptr) { // We have a present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..b99826e070e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -53,7 +54,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +64,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +367,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? 
inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +616,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -836,6 +837,10 @@ __global__ void __launch_bounds__(block_size) if (dict_idx > 0x7fff'ffffu) { dict_idx = s->chunk.dict_index[dict_idx & 0x7fff'ffffu]; } + // translate dictionary index to sorted order, if enabled + if (s->chunk.dict_data_order != nullptr) { + dict_idx = s->chunk.dict_data_order[dict_idx]; + } s->vals.u32[nz_idx] = dict_idx; } else { string_view value = column.element(row); @@ -1143,7 +1148,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1201,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; @@ -1260,6 +1265,38 @@ __global__ void __launch_bounds__(1024) } } +// Holds a non-owning view of a decimal column's element sizes +struct decimal_column_element_sizes { + uint32_t col_idx; + device_span sizes; +}; + +// Converts sizes of individual decimal elements to offsets within each row group +// Conversion is done in-place +template +__global__ void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, + device_span sizes) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + auto const& col_elem_sizes = sizes[blockIdx.x]; + auto const& row_group = rg_bounds[blockIdx.y][col_elem_sizes.col_idx]; + auto const elem_sizes = col_elem_sizes.sizes.data() + row_group.begin; + + uint32_t initial_value = 0; + // Do a series of block sums, storing results in the array as we go + for (int64_t pos = 0; pos < row_group.size(); pos += block_size) { + auto const tidx = pos + t; + auto tval = tidx < row_group.size() ? 
elem_sizes[tidx] : 0u; + uint32_t block_sum = 0; + block_scan(scan_storage).InclusiveSum(tval, tval, block_sum); + if (tidx < row_group.size()) { elem_sizes[tidx] = tval + initial_value; } + initial_value += block_sum; + } +} + void EncodeOrcColumnData(device_2dspan chunks, device_2dspan streams, rmm::cuda_stream_view stream) @@ -1368,6 +1405,30 @@ std::optional CompressOrcDataStreams( } } +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream) +{ + if (rg_bounds.count() == 0) return; + + // Convert map to a vector of views of the `elem_sizes` device buffers + std::vector h_sizes; + h_sizes.reserve(elem_sizes.size()); + std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { + return decimal_column_element_sizes{p.first, p.second}; + }); + + // Copy the vector of views to the device so that we can pass it to the kernel + auto d_sizes = cudf::detail::make_device_uvector_async( + h_sizes, stream, rmm::mr::get_current_device_resource()); + + constexpr int block_size = 256; + dim3 const grid_size{static_cast(elem_sizes.size()), // num decimal columns + static_cast(rg_bounds.size().first)}; // num rowgroups + decimal_sizes_to_offsets_kernel + <<>>(rg_bounds, d_sizes); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3d8bdb4ec97..ac5993e764e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ #include #include #include +#include +#include #include #include @@ -867,16 +870,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); ck.encoding_kind = column.orc_encoding(); ck.type_kind = column.orc_kind(); - if (ck.type_kind == TypeKind::STRING) { - ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) - ? column.host_stripe_dict(stripe.id).index.data() - : nullptr; - ck.dtype_len = 1; - } else { - ck.dtype_len = column.type_width(); - } - ck.scale = column.scale(); - if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); } + auto const is_str_dict = + ck.type_kind == TypeKind::STRING and ck.encoding_kind == DICTIONARY_V2; + ck.dict_index = is_str_dict ? column.host_stripe_dict(stripe.id).index.data() : nullptr; + ck.dict_data_order = + is_str_dict ? column.host_stripe_dict(stripe.id).data_order.data() : nullptr; + ck.dtype_len = (ck.type_kind == TypeKind::STRING) ? 1 : column.type_width(); + ck.scale = column.scale(); + ck.decimal_offsets = + (ck.type_kind == TypeKind::DECIMAL) ? 
column.decimal_offsets() : nullptr; } } } @@ -1882,7 +1884,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, auto& current_sizes = elem_sizes.insert({orc_col.index(), rmm::device_uvector(orc_col.size(), stream)}) .first->second; - thrust::tabulate(rmm::exec_policy(stream), + thrust::tabulate(rmm::exec_policy_nosync(stream), current_sizes.begin(), current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, @@ -1908,25 +1910,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, return varint_size(zigzaged_value); }); - // Compute element offsets within each row group - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - segmentation.num_rowgroups(), - [sizes = device_span{current_sizes}, - rg_bounds = device_2dspan{segmentation.rowgroups}, - col_idx = orc_col.index()] __device__(auto rg_idx) { - auto const& range = rg_bounds[rg_idx][col_idx]; - thrust::inclusive_scan(thrust::seq, - sizes.begin() + range.begin, - sizes.begin() + range.end, - sizes.begin() + range.begin); - }); - orc_col.attach_decimal_offsets(current_sizes.data()); } } if (elem_sizes.empty()) return {}; + // Compute element offsets within each row group + gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); + // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); std::map> rg_sizes; @@ -2023,24 +2014,41 @@ struct stripe_dictionaries { hostdevice_2dvector views; // descriptors [string_column][stripe] std::vector> data_owner; // dictionary data owner, per stripe std::vector> index_owner; // dictionary index owner, per stripe + std::vector> order_owner; // dictionary order owner, per stripe // Should be called after encoding is complete to deallocate the dictionary buffers. 
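The in-place size-to-offset conversion above is a block-wide inclusive scan whose running total is carried across chunks of `block_size` elements. A minimal sketch of that pattern (hypothetical kernel name, one block scanning one array), assuming only CUB:

#include <cub/cub.cuh>

// One thread block converts element sizes to inclusive offsets in place, walking the
// array in block_size chunks and carrying the running total across chunks, mirroring
// the decimal_sizes_to_offsets_kernel shown above.
template <int block_size>
__global__ void sizes_to_offsets_demo(uint32_t* data, int64_t n)
{
  using block_scan = cub::BlockScan<uint32_t, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;
  int const t = threadIdx.x;

  uint32_t running_total = 0;  // sum of all fully processed chunks
  for (int64_t pos = 0; pos < n; pos += block_size) {
    auto const tidx    = pos + t;
    uint32_t tval      = tidx < n ? data[tidx] : 0u;
    uint32_t chunk_sum = 0;
    block_scan(scan_storage).InclusiveSum(tval, tval, chunk_sum);
    if (tidx < n) { data[tidx] = tval + running_total; }
    running_total += chunk_sum;
    __syncthreads();  // scan_storage is reused by the next chunk
  }
}
// e.g. sizes_to_offsets_demo<256><<<1, 256>>>(d_sizes, n) turns {2, 3, 1, 5} into {2, 5, 6, 11}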
void on_encode_complete(rmm::cuda_stream_view stream) { data_owner.clear(); index_owner.clear(); + order_owner.clear(); for (auto& sd : views.host_view().flat_view()) { - sd.data = {}; - sd.index = {}; + sd.data = {}; + sd.index = {}; + sd.data_order = {}; } views.host_to_device_async(stream); } }; +/** + * @brief Compares two rows in a strings column + */ +struct string_rows_less { + device_span cols; + uint32_t col_idx; + __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const + { + auto const& col = cols[col_idx]; + return col.element(lhs_idx) < col.element(rhs_idx); + } +}; + // Build stripe dictionaries for string columns stripe_dictionaries build_dictionaries(orc_table_view& orc_table, file_segmentation const& segmentation, + bool sort_dictionaries, rmm::cuda_stream_view stream) { std::vector>> hash_maps_storage( @@ -2091,6 +2099,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, // Data owners; can be cleared after encode std::vector> dict_data_owner; std::vector> dict_index_owner; + std::vector> dict_order_owner; // Make decision about which stripes to encode with dictionary encoding for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2133,15 +2142,61 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::collect_map_entries(stripe_dicts, stream); gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); - // Clear map slots; hash map storage is deallocated at the end of this function - auto device_dicts_flat = stripe_dicts.device_view().flat_view(); - thrust::for_each(rmm::exec_policy(stream), - device_dicts_flat.begin(), - device_dicts_flat.end(), - [] __device__(auto& sd) { sd.map_slots = {}; }); - stripe_dicts.device_to_host_async(stream); + // deallocate hash map storage, unused after this point + hash_maps_storage.clear(); + + // Clear map slots and attach order buffers + auto dictionaries_flat = stripe_dicts.host_view().flat_view(); + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + sd.map_slots = {}; + if (sort_dictionaries) { + dict_order_owner.emplace_back(sd.entry_count, stream); + sd.data_order = dict_order_owner.back(); + } else { + sd.data_order = {}; + } + } + stripe_dicts.host_to_device_async(stream); + + // Sort stripe dictionaries alphabetically + if (sort_dictionaries) { + auto streams = cudf::detail::fork_streams(stream, std::min(dict_order_owner.size(), 8)); + auto stream_idx = 0; + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + auto const& current_stream = streams[stream_idx]; + + // Sort the dictionary data and create a mapping from the sorted order to the original + thrust::sequence( + rmm::exec_policy_nosync(current_stream), sd.data_order.begin(), sd.data_order.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(current_stream), + sd.data.begin(), + sd.data.end(), + sd.data_order.begin(), + string_rows_less{orc_table.d_columns, sd.column_idx}); + + // Create the inverse permutation - i.e. 
the mapping from the original order to the sorted + auto order_copy = cudf::detail::make_device_uvector_async( + sd.data_order, current_stream, rmm::mr::get_current_device_resource()); + thrust::scatter(rmm::exec_policy_nosync(current_stream), + thrust::counting_iterator(0), + thrust::counting_iterator(sd.data_order.size()), + order_copy.begin(), + sd.data_order.begin()); + + stream_idx = (stream_idx + 1) % streams.size(); + } + + cudf::detail::join_streams(streams, stream); + } - return {std::move(stripe_dicts), std::move(dict_data_owner), std::move(dict_index_owner)}; + return {std::move(stripe_dicts), + std::move(dict_data_owner), + std::move(dict_index_owner), + std::move(dict_order_owner)}; } /** @@ -2153,6 +2208,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param max_stripe_size Maximum size of stripes in the output file * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled + * @param sort_dictionaries Whether to sort the dictionaries * @param compression_kind The compression kind * @param compression_blocksize The block size used for compression * @param stats_freq Column statistics granularity type for parquet/orc writers @@ -2167,6 +2223,7 @@ auto convert_table_to_orc_data(table_view const& input, stripe_size_limits max_stripe_size, size_type row_index_stride, bool enable_dictionary, + bool sort_dictionaries, CompressionKind compression_kind, size_t compression_blocksize, statistics_freq stats_freq, @@ -2191,7 +2248,7 @@ auto convert_table_to_orc_data(table_view const& input, auto segmentation = calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); - auto stripe_dicts = build_dictionaries(orc_table, segmentation, stream); + auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); @@ -2325,6 +2382,7 @@ writer::impl::impl(std::unique_ptr sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2346,6 +2404,7 @@ writer::impl::impl(std::unique_ptr sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2393,6 +2452,7 @@ void writer::impl::write(table_view const& input) _max_stripe_size, _row_index_stride, _enable_dictionary, + _sort_dictionaries, _compression_kind, _compression_blocksize, _stats_freq, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 67c65eb9a37..0d1a83f3d85 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -346,6 +346,7 @@ class writer::impl { size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; + bool const _sort_dictionaries; single_write_mode const _single_write_mode; // Special parameter only used by `write()` to // indicate that we are guaranteeing a single table 
// write. This enables some internal optimizations. diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..5a2b8aa8f2a 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Base class for parquet field functors. @@ -341,61 +339,6 @@ struct parquet_field_struct_list : public parquet_field_list { } }; -// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields -// in a struct -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty True if tparam `T` type is empty type, else false. 
- * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } -}; - -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - /** * @brief Functor to read a binary from CompactProtocolReader * @@ -597,34 +540,38 @@ bool CompactProtocolReader::read(FileMetaData* f) bool CompactProtocolReader::read(SchemaElement* s) { + using optional_converted_type = + parquet_field_optional>; + using optional_logical_type = + parquet_field_optional>; auto op = std::make_tuple(parquet_field_enum(1, s->type), parquet_field_int32(2, s->type_length), parquet_field_enum(3, s->repetition_type), parquet_field_string(4, s->name), parquet_field_int32(5, s->num_children), - parquet_field_enum(6, s->converted_type), + optional_converted_type(6, s->converted_type), parquet_field_int32(7, s->decimal_scale), parquet_field_int32(8, s->decimal_precision), parquet_field_optional(9, s->field_id), - parquet_field_struct(10, s->logical_type)); + optional_logical_type(10, s->logical_type)); return function_builder(this, op); } bool CompactProtocolReader::read(LogicalType* l) { - auto op = - std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING), - ParquetFieldUnion(2, l->isset.MAP, l->MAP), - ParquetFieldUnion(3, l->isset.LIST, l->LIST), - ParquetFieldUnion(4, l->isset.ENUM, l->ENUM), - ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct - ParquetFieldUnion(6, l->isset.DATE, l->DATE), - ParquetFieldUnion(7, l->isset.TIME, l->TIME), // read the struct - ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP), // read the struct - ParquetFieldUnion(10, l->isset.INTEGER, l->INTEGER), // read the struct - ParquetFieldUnion(11, l->isset.UNKNOWN, l->UNKNOWN), - ParquetFieldUnion(12, l->isset.JSON, l->JSON), - ParquetFieldUnion(13, l->isset.BSON, l->BSON)); + auto op = std::make_tuple( + parquet_field_union_enumerator(1, l->type), + parquet_field_union_enumerator(2, l->type), + parquet_field_union_enumerator(3, l->type), + parquet_field_union_enumerator(4, l->type), + parquet_field_union_struct(5, l->type, l->decimal_type), + parquet_field_union_enumerator(6, l->type), + parquet_field_union_struct(7, l->type, l->time_type), + parquet_field_union_struct(8, l->type, l->timestamp_type), + parquet_field_union_struct(10, l->type, l->int_type), + parquet_field_union_enumerator(11, l->type), + parquet_field_union_enumerator(12, l->type), + parquet_field_union_enumerator(13, l->type)); return function_builder(this, op); } @@ -650,9 +597,9 @@ bool CompactProtocolReader::read(TimestampType* t) bool CompactProtocolReader::read(TimeUnit* u) { - auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS), 
- ParquetFieldUnion(2, u->isset.MICROS, u->MICROS), - ParquetFieldUnion(3, u->isset.NANOS, u->NANOS)); + auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type), + parquet_field_union_enumerator(2, u->type), + parquet_field_union_enumerator(3, u->type)); return function_builder(this, op); } @@ -769,12 +716,15 @@ bool CompactProtocolReader::read(ColumnIndex* c) bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(parquet_field_binary(1, s->max), - parquet_field_binary(2, s->min), - parquet_field_int64(3, s->null_count), - parquet_field_int64(4, s->distinct_count), - parquet_field_binary(5, s->max_value), - parquet_field_binary(6, s->min_value)); + using optional_binary = parquet_field_optional, parquet_field_binary>; + using optional_int64 = parquet_field_optional; + + auto op = std::make_tuple(optional_binary(1, s->max), + optional_binary(2, s->min), + optional_int64(3, s->null_count), + optional_int64(4, s->distinct_count), + optional_binary(5, s->max_value), + optional_binary(6, s->min_value)); return function_builder(this, op); } @@ -870,6 +820,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..fbeda7f1099 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,9 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +#include + +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -48,13 +48,11 @@ size_t CompactProtocolWriter::write(DecimalType const& decimal) size_t CompactProtocolWriter::write(TimeUnit const& time_unit) { CompactProtocolFieldWriter c(*this); - auto const isset = time_unit.isset; - if (isset.MILLIS) { - c.field_struct(1, time_unit.MILLIS); - } else if (isset.MICROS) { - c.field_struct(2, time_unit.MICROS); - } else if (isset.NANOS) { - c.field_struct(3, time_unit.NANOS); + switch (time_unit.type) { + case TimeUnit::MILLIS: + case TimeUnit::MICROS: + case TimeUnit::NANOS: c.field_empty_struct(time_unit.type); break; + default: CUDF_FAIL("Trying to write an invalid TimeUnit " + std::to_string(time_unit.type)); } return c.value(); } @@ -86,31 +84,29 @@ size_t CompactProtocolWriter::write(IntType const& integer) size_t CompactProtocolWriter::write(LogicalType const& logical_type) { CompactProtocolFieldWriter c(*this); - auto const isset = logical_type.isset; - if (isset.STRING) { - c.field_struct(1, logical_type.STRING); - } else if (isset.MAP) { - c.field_struct(2, logical_type.MAP); - } else if (isset.LIST) { - c.field_struct(3, logical_type.LIST); - } else if (isset.ENUM) { - c.field_struct(4, 
logical_type.ENUM); - } else if (isset.DECIMAL) { - c.field_struct(5, logical_type.DECIMAL); - } else if (isset.DATE) { - c.field_struct(6, logical_type.DATE); - } else if (isset.TIME) { - c.field_struct(7, logical_type.TIME); - } else if (isset.TIMESTAMP) { - c.field_struct(8, logical_type.TIMESTAMP); - } else if (isset.INTEGER) { - c.field_struct(10, logical_type.INTEGER); - } else if (isset.UNKNOWN) { - c.field_struct(11, logical_type.UNKNOWN); - } else if (isset.JSON) { - c.field_struct(12, logical_type.JSON); - } else if (isset.BSON) { - c.field_struct(13, logical_type.BSON); + switch (logical_type.type) { + case LogicalType::STRING: + case LogicalType::MAP: + case LogicalType::LIST: + case LogicalType::ENUM: + case LogicalType::DATE: + case LogicalType::UNKNOWN: + case LogicalType::JSON: + case LogicalType::BSON: c.field_empty_struct(logical_type.type); break; + case LogicalType::DECIMAL: + c.field_struct(LogicalType::DECIMAL, logical_type.decimal_type.value()); + break; + case LogicalType::TIME: + c.field_struct(LogicalType::TIME, logical_type.time_type.value()); + break; + case LogicalType::TIMESTAMP: + c.field_struct(LogicalType::TIMESTAMP, logical_type.timestamp_type.value()); + break; + case LogicalType::INTEGER: + c.field_struct(LogicalType::INTEGER, logical_type.int_type.value()); + break; + default: + CUDF_FAIL("Trying to write an invalid LogicalType " + std::to_string(logical_type.type)); } return c.value(); } @@ -126,20 +122,15 @@ size_t CompactProtocolWriter::write(SchemaElement const& s) c.field_string(4, s.name); if (s.type == UNDEFINED_TYPE) { c.field_int(5, s.num_children); } - if (s.converted_type != UNKNOWN) { - c.field_int(6, s.converted_type); + if (s.converted_type.has_value()) { + c.field_int(6, s.converted_type.value()); if (s.converted_type == DECIMAL) { c.field_int(7, s.decimal_scale); c.field_int(8, s.decimal_precision); } } - if (s.field_id) { c.field_int(9, s.field_id.value()); } - auto const isset = s.logical_type.isset; - // TODO: add handling for all logical types - // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or - // isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON) - // { - if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); } + if (s.field_id.has_value()) { c.field_int(9, s.field_id.value()); } + if (s.logical_type.has_value()) { c.field_struct(10, s.logical_type.value()); } return c.value(); } @@ -197,12 +188,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (not s.max.empty()) { c.field_binary(1, s.max); } - if (not s.min.empty()) { c.field_binary(2, s.min); } - if (s.null_count != -1) { c.field_int(3, s.null_count); } - if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } - if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } + if (s.max.has_value()) { c.field_binary(1, s.max.value()); } + if (s.min.has_value()) { c.field_binary(2, s.min.value()); } + if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); } + if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } + if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } + if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } return c.value(); } @@ -225,9 +216,9 @@ size_t 
CompactProtocolWriter::write(OffsetIndex const& s) size_t CompactProtocolWriter::write(ColumnOrder const& co) { CompactProtocolFieldWriter c(*this); - switch (co) { - case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; - default: break; + switch (co.type) { + case ColumnOrder::TYPE_ORDER: c.field_empty_struct(co.type); break; + default: CUDF_FAIL("Trying to write an invalid ColumnOrder " + std::to_string(co.type)); } return c.value(); } @@ -391,6 +382,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..ccc28791071 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -39,21 +39,15 @@ namespace cudf::io::parquet::gpu { // per mini-block. While encoding, the lowest delta value is subtracted from all the deltas in the // block to ensure that all encoded values are positive. The deltas for each mini-block are bit // packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. -// -// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY -// columns. For each element in a sequence of strings, a prefix length from the preceding string -// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are -// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix -// lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; +// The largest mini-block size we can currently support. +constexpr int max_delta_mini_block_size = 64; -// we decode one mini-block at a time. max mini-block size seen is 64. 
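For reference, a small worked example of the DELTA_BINARY_PACKED layout described above (the values and byte breakdown are assumed from the Parquet specification, not taken from this change). Encoding 7, 5, 3, 1, 2, 3, 4, 5 with block_size = 128 and 4 mini-blocks gives:

// header: 0x80 0x01  block size        = 128  (ULEB128)
//         0x04       mini-block count  = 4    (ULEB128)
//         0x08       total value count = 8    (ULEB128)
//         0x0E       first value       = 7    (zigzag ULEB128; zigzag of 7 is 14)
// block:  0x03       min delta         = -2   (deltas are -2,-2,-2,1,1,1,1; zigzag of -2 is 3)
//         0x02 0x00 0x00 0x00          per-mini-block bit widths (only the first holds data;
//                                      unused widths are commonly written as 0)
//         8 bytes    deltas 0,0,0,3,3,3,3 after subtracting the min delta, bit packed at
//                                      2 bits each and padded out to 32 values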
-constexpr int delta_rolling_buf_size = 128; +// The first pass decodes `values_per_mb` values, and then the second pass does another +// batch of size `values_per_mb`. The largest value for values_per_miniblock among the +// major writers seems to be 64, so 2 * 64 should be good. We save the first value separately +// since it is not encoded in the first mini-block. +constexpr int delta_rolling_buf_size = 2 * max_delta_mini_block_size; /** * @brief Read a ULEB128 varint integer @@ -96,7 +90,8 @@ struct delta_binary_decoder { uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a // multiple of 32 uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header + zigzag128_t first_value; // initial value, stored in the header + zigzag128_t last_value; // last value decoded uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 uint32_t current_value_idx; // current value index, initialized to 0 at start of block @@ -108,6 +103,13 @@ struct delta_binary_decoder { uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values + // returns the value stored in the `value` array at index + // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. + constexpr zigzag128_t value_at(size_type idx) + { + return idx == 0 ? first_value : value[rolling_index(idx)]; + } + // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. @@ -151,7 +153,8 @@ struct delta_binary_decoder { block_size = get_uleb128(d_start, d_end); mini_block_count = get_uleb128(d_start, d_end); value_count = get_uleb128(d_start, d_end); - last_value = get_zz128(d_start, d_end); + first_value = get_zz128(d_start, d_end); + last_value = first_value; current_value_idx = 0; values_per_mb = block_size / mini_block_count; @@ -185,6 +188,28 @@ struct delta_binary_decoder { } } + // given start/end pointers in the data, find the end of the binary encoded block. when done, + // `this` will be initialized with the correct start and end positions. returns the end, which is + // start of data/next block. should only be called from thread 0. + inline __device__ uint8_t const* find_end_of_block(uint8_t const* start, uint8_t const* end) + { + // read block header + init_binary_block(start, end); + + // test for no encoded values. a single value will be in the block header. + if (value_count <= 1) { return block_start; } + + // read mini-block headers and skip over data + while (current_value_idx < num_encoded_values(false)) { + setup_next_mini_block(false); + } + // calculate the correct end of the block + auto const* const new_end = cur_mb == 0 ? block_start : cur_mb_start; + // re-init block with correct end + init_binary_block(start, new_end); + return new_end; + } + // decode the current mini-batch of deltas, and convert to values. // called by all threads in a warp, currently only one warp supported. 
inline __device__ void calc_mini_block_values(int lane_id) @@ -192,12 +217,9 @@ struct delta_binary_decoder { using cudf::detail::warp_size; if (current_value_idx >= value_count) { return; } - // need to save first value from header on first pass + // need to account for the first value from header on first pass if (current_value_idx == 0) { - if (lane_id == 0) { - current_value_idx++; - value[0] = last_value; - } + if (lane_id == 0) { current_value_idx++; } __syncwarp(); if (current_value_idx >= value_count) { return; } } @@ -291,4 +313,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..b0a7493fcab --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +// TODO: if these are ever made configurable, be sure to fix the page size calculation in +// delta_data_len() (page_enc.cu). +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// An extra sanity checks to enforce compliance with the parquet specification. +static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. 
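A host-side sketch of the same two primitives (zigzag to fold the sign, then ULEB128 at 7 bits per byte); the helper name is illustrative only:

#include <cstdint>
#include <vector>

// Encode a signed value the way put_zz128/put_uleb128 above do, but into a std::vector.
// The shift/xor form of zigzag used here is equivalent to the (v ^ -s) * 2 + s form above.
std::vector<uint8_t> encode_zz_uleb128(int64_t v)
{
  uint64_t u = (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);  // -1 -> 1, 1 -> 2, -2 -> 3, ...
  std::vector<uint8_t> out;
  while (u > 0x7f) {
    out.push_back(static_cast<uint8_t>(u) | 0x80);  // set the continuation bit on all but the last byte
    u >>= 7;
  }
  out.push_back(static_cast<uint8_t>(u));
  return out;
}
// e.g. encode_zz_uleb128(-1) yields {0x01} and encode_zz_uleb128(64) yields {0x80, 0x01}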
+template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. + inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. 
Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. + int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. 
+ if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp new file mode 100644 index 00000000000..92b5eebe9fd --- /dev/null +++ b/cpp/src/io/parquet/error.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::io::parquet { + +/** + * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * kernel calls. + * + * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout + * the object's lifetime. + */ +class kernel_error { + private: + rmm::device_scalar _error_code; + + public: + /** + * @brief Construct a new `kernel_error` with an initial value of 0. + * + * Note: the initial value is set asynchronously. + * + * @throws `rmm::bad_alloc` if allocating the device memory for `initial_value` fails. + * @throws `rmm::cuda_error` if copying `initial_value` to device memory fails. + * + * @param CUDA stream to use + */ + kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {} + + /** + * @brief Return a pointer to the device memory for the error + */ + [[nodiscard]] auto data() { return _error_code.data(); } + + /** + * @brief Return the current value of the error + * + * This uses the stream used to create this instance. This does a synchronize on the stream + * this object was instantiated with. + */ + [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); } + + /** + * @brief Return a hexadecimal string representation of the current error code + * + * Returned string will have "0x" prepended. 
+ */ + [[nodiscard]] std::string str() const + { + std::stringstream sstream; + sstream << std::hex << value(); + return "0x" + sstream.str(); + } +}; + +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..0c53877f7c7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -452,8 +449,13 @@ __global__ void __launch_bounds__(decode_block_size) int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{decode_kernel_mask::GENERAL}, + true)) { return; } @@ -489,6 +491,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. @@ -602,14 +605,11 @@ __global__ void __launch_bounds__(decode_block_size) } __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } struct mask_tform { - __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } + __device__ uint32_t operator()(PageInfo const& p) { return static_cast(p.kernel_mask); } }; } // anonymous namespace @@ -624,7 +624,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +648,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..4db9bd3904b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // for nested schemas, it's more complicated. This warp will visit 32 incoming values, // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use + // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); @@ -991,8 +991,15 @@ struct all_types_filter { * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. 
*/ struct mask_filter { - int mask; - __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } + uint32_t mask; + + __device__ mask_filter(uint32_t m) : mask(m) {} + __device__ mask_filter(decode_kernel_mask m) : mask(static_cast(m)) {} + + __device__ inline bool operator()(PageInfo const& page) + { + return BitAnd(mask, page.kernel_mask) != 0; + } }; /** @@ -1143,7 +1150,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, units = cudf::timestamp_ms::period::den; } else if (s->col.converted_type == TIMESTAMP_MICROS) { units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + } else if (s->col.logical_type.has_value() and + s->col.logical_type->is_timestamp_nanos()) { units = cudf::timestamp_ns::period::den; } if (units and units != s->col.ts_clock_rate) { @@ -1305,6 +1313,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: // nothing to do, just don't error break; default: { @@ -1384,4 +1393,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..bc025c6fc3e 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,10 +23,281 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { +constexpr int decode_block_size = 128; + +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. +struct delta_byte_array_decoder { + uint8_t const* last_string; // pointer to last decoded string...needed for its prefix + uint8_t const* suffix_char_data; // pointer to the start of character data + + uint8_t* temp_buf; // buffer used when skipping values + uint32_t start_val; // decoded strings up to this index will be dumped to temp_buf + uint32_t last_string_len; // length of the last decoded string + + delta_binary_decoder prefixes; // state of decoder for prefix lengths + delta_binary_decoder suffixes; // state of decoder for suffix lengths + + // initialize the prefixes and suffixes blocks + __device__ void init(uint8_t const* start, uint8_t const* end, uint32_t start_idx, uint8_t* temp) + { + auto const* suffix_start = prefixes.find_end_of_block(start, end); + suffix_char_data = suffixes.find_end_of_block(suffix_start, end); + last_string = nullptr; + temp_buf = temp; + start_val = start_idx; + } + + // kind of like an inclusive scan for strings. takes prefix_len bytes from preceding + // string and prepends to the suffix we've already copied into place. called from + // within loop over values_in_mb, so this only needs to handle a single warp worth of data + // at a time. 
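A host-side sketch of the front coding this decoder reverses (names are illustrative; the device paths below do the same reconstruction warp-parallel): each output string is the first prefix_len bytes of the previous output plus this row's suffix.

#include <string>
#include <vector>

std::vector<std::string> decode_front_coded(std::vector<size_t> const& prefix_lens,
                                            std::vector<std::string> const& suffixes)
{
  std::vector<std::string> out;
  std::string prev;
  for (size_t i = 0; i < suffixes.size(); ++i) {
    // take prefix_lens[i] bytes from the previous string, then append this row's suffix
    std::string cur = prev.substr(0, prefix_lens[i]) + suffixes[i];
    out.push_back(cur);
    prev = std::move(cur);
  }
  return out;
}
// e.g. prefix lengths {0, 5, 4} with suffixes {"apple", "t", "y"} decode to {"apple", "applet", "apply"}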
+ __device__ void string_scan(uint8_t* strings_out, + uint8_t const* last_string, + uint32_t start_idx, + uint32_t end_idx, + uint32_t offset, + uint32_t lane_id) + { + using cudf::detail::warp_size; + + // let p(n) === length(prefix(string_n)) + // + // if p(n-1) > p(n), then string_n can be completed when string_n-2 is completed. likewise if + // p(m) > p(n), then string_n can be completed with string_m-1. however, if p(m) < p(n), then m + // is a "blocker" for string_n; string_n can be completed only after string_m is. + // + // we will calculate the nearest blocking position for each lane, and then fill in string_0. we + // then iterate, finding all lanes that have had their "blocker" filled in and completing them. + // when all lanes are filled in, we return. this will still hit the worst case if p(n-1) < p(n) + // for all n + __shared__ __align__(8) int64_t prefix_lens[warp_size]; + __shared__ __align__(8) uint8_t const* offsets[warp_size]; + + uint32_t const ln_idx = start_idx + lane_id; + uint64_t prefix_len = ln_idx < end_idx ? prefixes.value_at(ln_idx) : 0; + uint8_t* const lane_out = ln_idx < end_idx ? strings_out + offset : nullptr; + + prefix_lens[lane_id] = prefix_len; + offsets[lane_id] = lane_out; + + // if all prefix_len's are zero, then there's nothing to do + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + + // find a neighbor to the left that has a prefix length less than this lane. once that + // neighbor is complete, this lane can be completed. + int blocker = lane_id - 1; + while (blocker > 0 && prefix_lens[blocker] != 0 && prefix_len <= prefix_lens[blocker]) { + blocker--; + } + + // fill in lane 0 (if necessary) + if (lane_id == 0 && prefix_len > 0) { + memcpy(lane_out, last_string, prefix_len); + prefix_lens[0] = prefix_len = 0; + } + __syncwarp(); + + // now fill in blockers until done + for (uint32_t i = 1; i < warp_size && i + start_idx < end_idx; i++) { + if (prefix_len != 0 && prefix_lens[blocker] == 0 && lane_out != nullptr) { + memcpy(lane_out, offsets[blocker], prefix_len); + prefix_lens[lane_id] = prefix_len = 0; + } + + // check for finished + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + } + } + + // calculate a mini-batch of string values, writing the results to + // `strings_out`. starting at global index `start_idx` and decoding + // up to `num_values` strings. + // called by all threads in a warp. used for strings <= 32 chars. + // returns number of bytes written + __device__ size_t calculate_string_values(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + using WarpScan = cub::WarpScan; + __shared__ WarpScan::TempStorage scan_temp; + + if (start_idx >= suffixes.value_count) { return 0; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + auto p_strings_out = strings_out; + auto p_temp_out = temp_buf; + + auto copy_batch = [&](uint8_t* out, uint32_t idx, uint32_t end) { + uint32_t const ln_idx = idx + lane_id; + + // calculate offsets into suffix data + uint64_t const suffix_len = ln_idx < end ? suffixes.value_at(ln_idx) : 0; + uint64_t suffix_off = 0; + WarpScan(scan_temp).ExclusiveSum(suffix_len, suffix_off); + + // calculate offsets into string data + uint64_t const prefix_len = ln_idx < end ? 
prefixes.value_at(ln_idx) : 0; + uint64_t const string_len = prefix_len + suffix_len; + + // get offset into output for each lane + uint64_t string_off, warp_total; + WarpScan(scan_temp).ExclusiveSum(string_len, string_off, warp_total); + auto const so_ptr = out + string_off; + + // copy suffixes into string data + if (ln_idx < end) { memcpy(so_ptr + prefix_len, suffix_char_data + suffix_off, suffix_len); } + __syncwarp(); + + // copy prefixes into string data. + string_scan(out, last_string, idx, end, string_off, lane_id); + + // save the position of the last computed string. this will be used in + // the next iteration to reconstruct the string in lane 0. + if (ln_idx == end - 1 || (ln_idx < end && lane_id == 31)) { + // set last_string to this lane's string + last_string = out + string_off; + last_string_len = string_len; + // and consume used suffix_char_data + suffix_char_data += suffix_off + suffix_len; + } + + return warp_total; + }; + + uint64_t string_total = 0; + for (int idx = start_idx; idx < end_idx; idx += warp_size) { + auto const n_in_batch = min(warp_size, end_idx - idx); + // account for the case where start_val occurs in the middle of this batch + if (idx < start_val && idx + n_in_batch > start_val) { + // dump idx...start_val into temp_buf + copy_batch(p_temp_out, idx, start_val); + __syncwarp(); + + // start_val...idx + n_in_batch into strings_out + auto nbytes = copy_batch(p_strings_out, start_val, idx + n_in_batch); + p_strings_out += nbytes; + string_total = nbytes; + } else { + if (idx < start_val) { + p_temp_out += copy_batch(p_temp_out, idx, end_idx); + } else { + auto nbytes = copy_batch(p_strings_out, idx, end_idx); + p_strings_out += nbytes; + string_total += nbytes; + } + } + __syncwarp(); + } + + return string_total; + } + + // character parallel version of CalculateStringValues(). This is faster for strings longer than + // 32 chars. + __device__ size_t calculate_string_values_cp(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + __shared__ __align__(8) uint8_t* so_ptr; + + if (start_idx >= suffixes.value_count) { return; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + if (lane_id == 0) { so_ptr = start_idx < start_val ? temp_buf : strings_out; } + __syncwarp(); + + uint64_t string_total = 0; + for (int idx = start_idx; idx < end_idx; idx++) { + uint64_t const suffix_len = suffixes.value_at(idx); + uint64_t const prefix_len = prefixes.value_at(idx); + uint64_t const string_len = prefix_len + suffix_len; + + // copy prefix and suffix data into current strings_out position + // for longer strings use a 4-byte version stolen from gather_chars_fn_string_parallel. + if (string_len > 64) { + if (prefix_len > 0) { wideStrcpy(so_ptr, last_string, prefix_len, lane_id); } + if (suffix_len > 0) { + wideStrcpy(so_ptr + prefix_len, suffix_char_data, suffix_len, lane_id); + } + } else { + for (int i = lane_id; i < string_len; i += warp_size) { + so_ptr[i] = i < prefix_len ? 
last_string[i] : suffix_char_data[i - prefix_len]; + } + } + __syncwarp(); + + if (idx >= start_val) { string_total += string_len; } + + if (lane_id == 0) { + last_string = so_ptr; + last_string_len = string_len; + suffix_char_data += suffix_len; + if (idx == start_val - 1) { + so_ptr = strings_out; + } else { + so_ptr += string_len; + } + } + __syncwarp(); + } + + return string_total; + } + + // dump strings before start_val to temp buf + __device__ void skip(bool use_char_ll) + { + using cudf::detail::warp_size; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + + // is this even necessary? return if asking to skip the whole block. + if (start_val >= prefixes.num_encoded_values(true)) { return; } + + // prefixes and suffixes will have the same parameters (it's checked earlier) + auto const batch_size = prefixes.values_per_mb; + + uint32_t skip_pos = 0; + while (prefixes.current_value_idx < start_val) { + // warp 0 gets prefixes and warp 1 gets suffixes + auto* const db = t < 32 ? &prefixes : &suffixes; + + // this will potentially decode past start_val, but that's ok + if (t < 64) { db->decode_batch(); } + __syncthreads(); + + // warp 0 decodes the batch. + if (t < 32) { + auto const num_to_decode = min(batch_size, start_val - skip_pos); + auto const bytes_written = + use_char_ll ? calculate_string_values_cp(temp_buf, skip_pos, num_to_decode, lane_id) + : calculate_string_values(temp_buf, skip_pos, num_to_decode, lane_id); + // store last_string someplace safe in temp buffer + if (t == 0) { + memcpy(temp_buf + bytes_written, last_string, last_string_len); + last_string = temp_buf + bytes_written; + } + } + skip_pos += prefixes.values_per_mb; + __syncthreads(); + } + } +}; + // Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). @@ -52,13 +323,9 @@ __global__ void __launch_bounds__(96) auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, - &pages[page_idx], - chunks, - min_row, - num_rows, - mask_filter{KERNEL_MASK_DELTA_BINARY}, - true)) { + auto const mask = decode_kernel_mask::DELTA_BINARY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -78,6 +345,10 @@ __global__ void __launch_bounds__(96) __syncthreads(); auto const batch_size = db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block // that has a value we need. @@ -93,6 +364,7 @@ __global__ void __launch_bounds__(96) } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } + // TODO(ets): see if this sync can be removed __syncthreads(); // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. 
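The string_scan() helper above completes each string only after its nearest left neighbor with a smaller prefix length (its "blocker") is complete, since the shared bytes ultimately originate there. A minimal serial sketch of that blocker search, with illustrative names; the kernel additionally treats already-completed lanes (prefix length zero) as non-blocking:

```cpp
#include <cstddef>
#include <vector>

// For each string i, find the nearest j < i whose prefix length is smaller
// than prefix_len[i]; string i cannot be finished before string j, because
// the bytes it shares with its predecessor ultimately come from j's output.
std::vector<std::size_t> find_blockers(std::vector<std::size_t> const& prefix_len)
{
  std::vector<std::size_t> blocker(prefix_len.size(), 0);
  for (std::size_t i = 1; i < prefix_len.size(); ++i) {
    std::size_t j = i - 1;
    while (j > 0 && prefix_len[i] <= prefix_len[j]) { --j; }
    blocker[i] = j;
  }
  return blocker;
}
```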
@@ -125,23 +397,12 @@ __global__ void __launch_bounds__(96) // place value for this thread if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + auto const val = db->value_at(sp + skipped_leaf_values); switch (s->dtype_len) { - case 1: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 2: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 4: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 8: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; + case 1: *static_cast(dst) = val; break; + case 2: *static_cast(dst) = val; break; + case 4: *static_cast(dst) = val; break; + case 8: *static_cast(dst) = val; break; } } } @@ -151,6 +412,161 @@ __global__ void __launch_bounds__(96) __syncthreads(); } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } +} + +// Decode page data that is DELTA_BYTE_ARRAY packed. This encoding consists of a DELTA_BINARY_PACKED +// array of prefix lengths, followed by a DELTA_BINARY_PACKED array of suffix lengths, followed by +// the suffixes (technically the suffixes are DELTA_LENGTH_BYTE_ARRAY encoded). The latter two can +// be used to create an offsets array for the suffix data, but then this needs to be combined with +// the prefix lengths to do the final decode for each value. Because the lengths of the prefixes and +// suffixes are not encoded in the header, we're going to have to first do a quick pass through them +// to find the start/end of each structure. +template +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeDeltaByteArray(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) +{ + using cudf::detail::warp_size; + __shared__ __align__(16) delta_byte_array_decoder db_state; + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + auto* const prefix_db = &db_state.prefixes; + auto* const suffix_db = &db_state.suffixes; + auto* const dba = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + + // copying logic from gpuDecodePageData. + PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. 
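The kernel comment above describes the DELTA_BYTE_ARRAY layout: prefix lengths, then suffix lengths (both DELTA_BINARY_PACKED), then the concatenated suffix bytes. A small self-contained example of the logical content and its reconstruction (the delta bit-packing itself is omitted):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
  // logical content of a page holding {"apple", "applesauce", "apply"}:
  std::vector<std::size_t> const prefix_len{0, 5, 4};            // bytes shared with the previous value
  std::vector<std::string> const suffix{"apple", "sauce", "y"};  // bytes actually stored

  // reconstruction: each value reuses prefix_len[i] bytes of its predecessor
  std::string prev;
  std::vector<std::string> out;
  for (std::size_t i = 0; i < suffix.size(); ++i) {
    prev = prev.substr(0, prefix_len[i]) + suffix[i];
    out.push_back(prev);
  }
  assert(out[0] == "apple" && out[1] == "applesauce" && out[2] == "apply");
  return 0;
}
```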
+ uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + if (t == 0) { + // initialize the prefixes and suffixes blocks + dba->init(s->data_start, s->data_end, s->page.start_val, s->page.temp_string_buf); + } + __syncthreads(); + + // assert that prefix and suffix have same mini-block size + if (prefix_db->values_per_mb != suffix_db->values_per_mb or + prefix_db->block_size != suffix_db->block_size or + prefix_db->value_count != suffix_db->value_count) { + set_error(static_cast(decode_error::DELTA_PARAM_MISMATCH), error_code); + return; + } + + // pointer to location to output final strings + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto strings_data = nesting_info_base[leaf_level_index].string_out; + + auto const batch_size = prefix_db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } + + // if this is a bounds page and nested, then we need to skip up front. non-nested will work + // its way through the page. + int string_pos = has_repetition ? s->page.start_val : 0; + auto const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + if (is_bounds_pg && string_pos > 0) { dba->skip(use_char_ll); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 3 * warp_size) { // warp 0..2 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + s->first_row + batch_size); + } else { // warp 3 + target_pos = min(s->nz_count, src_pos + batch_size); + } + // TODO(ets): see if this sync can be removed + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of prefixes, warp 2 will + // unpack a mini-batch of suffixes. warp3 waits one cycle for warps 0-2 to produce a batch, and + // then stuffs values into the proper location in the output. + if (t < warp_size) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + + } else if (t < 2 * warp_size) { + // warp 1 + prefix_db->decode_batch(); + + } else if (t < 3 * warp_size) { + // warp 2 + suffix_db->decode_batch(); + + } else if (src_pos < target_pos) { + // warp 3 + + int const nproc = min(batch_size, s->page.end_val - string_pos); + strings_data += use_char_ll + ? dba->calculate_string_values_cp(strings_data, string_pos, nproc, lane_id) + : dba->calculate_string_values(strings_data, string_pos, nproc, lane_id); + string_pos += nproc; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. 
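The non-null indices consumed above come out of a circular buffer addressed with rolling_index<N>(). Assuming it is the usual wrap-into-a-fixed-size-ring helper defined elsewhere in these sources, a sketch of the indexing:

```cpp
// The circular buffers above are addressed as buf[rolling_index<N>(pos)], i.e.
// pos wrapped into an N-entry ring (N is a compile-time constant, typically a
// power of two so the modulo reduces to a mask).
template <int buf_size>
constexpr int rolling_index(int pos)
{
  return pos % buf_size;
}

static_assert(rolling_index<512>(0) == 0);
static_assert(rolling_index<512>(513) == 1);
```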
+ if (!has_repetition) { dst_pos -= s->first_row; } + + if (dst_pos >= 0 && sp < target_pos) { + auto const offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + auto const src_idx = sp + skipped_leaf_values; + *offptr = prefix_db->value_at(src_idx) + suffix_db->value_at(src_idx); + } + __syncwarp(); + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); + if (t == 0 and s->error != 0) { cuda::atomic_ref ref{*error_code}; ref.fetch_or(s->error, cuda::std::memory_order_relaxed); @@ -160,7 +576,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +600,29 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray + */ +void __host__ DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 const dim_block(decode_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..2b7980c93e9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,16 +43,19 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +#include + +namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -72,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -84,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -208,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -219,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -239,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = is_valid ? 
s->col.leaf_column->element(val_idx) : 0; + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -326,10 +382,40 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta (because we only encode 7 bits per byte) + // one byte per mini block for the bitwidth + auto const mini_block_header_size = dtype_len + 1 + delta::num_mini_blocks; + // each encoded value can be at most sizeof(type) * 8 + 1 bits + auto const max_bits = dtype_len * 8 + 1; + // each data block will then be max_bits * values per block. vals_per_block is guaranteed to be + // divisible by 128 (via static assert on delta::block_size), but do safe division anyway. + auto const bytes_per_block = cudf::util::div_rounding_up_unsafe(max_bits * vals_per_block, 8); + auto const block_size = mini_block_header_size + bytes_per_block; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + // TODO: if we ever allow configurable block sizes then this calculation will need to be + // modified. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -357,6 +443,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. 
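delta_data_len() above is a pessimistic bound: every value is assumed to need sizeof(T)*8 + 1 bits, plus per-block and page headers. A worked example of the arithmetic, assuming delta::block_size = 128 and delta::num_mini_blocks = 4 (those constants live in delta_enc.cuh and are not shown in this diff):

```cpp
#include <cstdio>

// Worst-case DELTA_BINARY_PACKED size for 10'000 INT64 values, following
// delta_data_len() above; block parameters are assumed values.
int main()
{
  int const dtype_len       = 8;    // INT64
  int const vals_per_block  = 128;  // assumed delta::block_size
  int const num_mini_blocks = 4;    // assumed delta::num_mini_blocks
  long const num_values     = 10'000;

  long const num_blocks        = (num_values + vals_per_block - 1) / vals_per_block;  // 79
  int const mini_block_header  = dtype_len + 1 + num_mini_blocks;                     // 13
  int const max_bits           = dtype_len * 8 + 1;                                   // 65
  int const bytes_per_block    = (max_bits * vals_per_block + 7) / 8;                 // 1040
  int const block_bytes        = mini_block_header + bytes_per_block;                 // 1053
  int const header_size        = 2 + 1 + 5 + dtype_len + 1;                           // 17

  std::printf("upper bound: %ld bytes\n", header_size + num_blocks * block_bytes);
  return 0;
}
```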
+ auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -406,9 +500,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -508,7 +605,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -528,7 +630,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -792,8 +903,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -801,20 +916,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? 
__ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -831,17 +946,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -850,8 +965,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -860,7 +975,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -931,7 +1046,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -941,7 +1056,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -995,28 +1110,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. 
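For reference alongside the RleEncode() changes above: the runs it emits follow Parquet's RLE/bit-packed hybrid, in which a repeated run is a ULEB128 header of (run_length << 1) followed by the value in ceil(bit_width/8) little-endian bytes, and a literal run uses (group_count << 1) | 1 followed by bit-packed values. A sketch of emitting one repeated run (function name is illustrative):

```cpp
#include <cstdint>
#include <vector>

// Emit one repeated run: ULEB128 header (run_length << 1, LSB = 0 marks an
// RLE run), then the repeated value in ceil(bit_width / 8) bytes, little-endian.
void emit_repeated_run(std::vector<uint8_t>& out, uint32_t run_len, uint32_t value, int bit_width)
{
  uint32_t header = run_len << 1;
  while (header > 0x7f) {
    out.push_back(static_cast<uint8_t>((header & 0x7f) | 0x80));
    header >>= 7;
  }
  out.push_back(static_cast<uint8_t>(header));

  int const value_bytes = (bit_width + 7) / 8;
  for (int i = 0; i < value_bytes; ++i) {
    out.push_back(static_cast<uint8_t>(value >> (8 * i)));
  }
}
```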
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1029,6 +1138,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1081,23 +1192,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1124,29 +1236,122 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 
0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + // FIXME(ets): this needs to do error propagation back to the host + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data 
values __syncthreads(); auto const physical_type = s->col.physical_type; @@ -1158,10 +1363,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1170,219 +1371,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. 
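The "12 bytes of fixed length data" written below are the Parquet INT96 timestamp layout: 8 bytes of nanoseconds within the Julian day followed by 4 bytes of Julian day number, both little-endian. A host-side sketch of the same store; unlike the kernel's per-byte shifts, the memcpy form assumes a little-endian host:

```cpp
#include <cstdint>
#include <cstring>

// INT96 layout: bytes 0..7 = nanoseconds within the Julian day (LE),
// bytes 8..11 = Julian day number (LE), 12 bytes total.
void write_int96(uint8_t* dst, int64_t nanos_of_day, uint32_t julian_day)
{
  std::memcpy(dst, &nanos_of_day, sizeof(nanos_of_day));
  std::memcpy(dst + 8, &julian_day, sizeof(julian_day));
}
```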
+ v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. 
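The new BYTE_ARRAY case above writes PLAIN-encoded strings as a 4-byte little-endian length followed by the raw bytes (and DECIMAL128 values as byte-reversed FIXED_LEN_BYTE_ARRAY, since that representation is big-endian). A sketch of the BYTE_ARRAY layout with an illustrative helper:

```cpp
#include <cstdint>
#include <string>
#include <vector>

// PLAIN encoding of one BYTE_ARRAY value: 4-byte little-endian length, then the bytes.
void append_plain_byte_array(std::vector<uint8_t>& out, std::string const& s)
{
  uint32_t const len = static_cast<uint32_t>(s.size());
  uint8_t const le[4] = {static_cast<uint8_t>(len),
                         static_cast<uint8_t>(len >> 8),
                         static_cast<uint8_t>(len >> 16),
                         static_cast<uint8_t>(len >> 24)};
  out.insert(out.end(), le, le + 4);
  out.insert(out.end(), s.begin(), s.end());
}
```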
- v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // 
TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1395,37 +1692,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename 
delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. 
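delta_encode() above routes signed column types through a zigzag output type before packing, and the DELTA_BINARY_PACKED header quantities (first value, per-block minimum delta) are likewise zigzag ULEB128 in the format. Assuming zigzag128_t (defined in delta_enc.cuh, not shown in this diff) implements the standard mapping, a sketch of it for 64-bit values:

```cpp
#include <cassert>
#include <cstdint>

// zigzag maps small-magnitude signed values to small unsigned ones:
//   0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
// so the ULEB128 bytes stay short for values near zero.
uint64_t zigzag_encode(int64_t v)
{
  return (static_cast<uint64_t>(v) << 1) ^ (v < 0 ? ~uint64_t{0} : uint64_t{0});
}

int64_t zigzag_decode(uint64_t u)
{
  return static_cast<int64_t>(u >> 1) ^ -static_cast<int64_t>(u & 1);
}

int main()
{
  assert(zigzag_encode(0) == 0 && zigzag_encode(-1) == 1 && zigzag_encode(1) == 2);
  assert(zigzag_decode(zigzag_encode(-123456789)) == -123456789);
  return 0;
}
```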
+ uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1460,7 +1857,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1923,7 +2321,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? 
page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -1988,7 +2387,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2158,6 +2557,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2260,12 +2663,13 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2280,21 +2684,21 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2302,10 +2706,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. 
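The replacement launch logic below ORs the per-page kernel masks together and then launches one specialized encoder per set bit, each on a forked stream. A plain host sketch of that dispatch shape; the bit values are illustrative, and std::function calls stand in for the kernel launches issued on fork_streams/join_streams in the real code:

```cpp
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

constexpr uint32_t PLAIN_BIT = 1, DICT_BIT = 2, DELTA_BIT = 4;  // illustrative mask bits

void dispatch(std::vector<uint32_t> const& page_masks,
              std::vector<std::pair<uint32_t, std::function<void()>>> const& encoders)
{
  // combine the per-page masks (thrust::reduce with bit_or on the device in the real code)
  uint32_t combined = 0;
  for (auto m : page_masks) { combined |= m; }

  // run each encoder whose pages are present; each launch would go to its own stream
  for (auto const& [bit, launch] : encoders) {
    if ((combined & bit) != 0) { launch(); }
  }
}
```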
- gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2323,12 +2760,12 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, comp_results, page_stats, chunk_stats); } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2780,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..595dd40cdc2 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -16,34 +16,18 @@ #include "parquet_gpu.hpp" #include + +#include + #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md -static const __device__ __constant__ uint8_t g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; - struct byte_stream_s { uint8_t const* cur{}; uint8_t const* end{}; @@ -142,12 +126,13 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; - if (n == 0xf) n = get_u32(bs); - field_type = g_list2struct[c & 0xf]; - if (field_type == ST_FLD_STRUCT) + if (n == 0xf) { n = get_u32(bs); } + field_type = c & 0xf; + if (field_type == ST_FLD_STRUCT) { struct_depth += n; - else + } else { rep_cnt = n; + } } break; case ST_FLD_STRUCT: struct_depth++; break; } @@ -161,19 +146,21 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, + ColumnChunkDesc const& chunk) { - if (page.flags & 
PAGEINFO_FLAGS_DICTIONARY) { return 0; } + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return decode_kernel_mask::NONE; } if (page.encoding == Encoding::DELTA_BINARY_PACKED) { - return KERNEL_MASK_DELTA_BINARY; + return decode_kernel_mask::DELTA_BINARY; + } else if (page.encoding == Encoding::DELTA_BYTE_ARRAY) { + return decode_kernel_mask::DELTA_BYTE_ARRAY; } else if (is_string_col(chunk)) { - return KERNEL_MASK_STRING; + return decode_kernel_mask::STRING; } // non-string, non-delta - return KERNEL_MASK_GENERAL; + return decode_kernel_mask::GENERAL; } /** @@ -359,16 +346,20 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, int32_t* error_code) { + using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; - int lane_id = threadIdx.x % 32; - int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; + int32_t error[4] = {0}; + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + auto const chunk = (blockIdx.x * 4) + warp_id; + auto const bs = &bs_g[warp_id]; - if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; + if (chunk < num_chunks and lane_id == 0) { bs->ck = chunks[chunk]; } + if (lane_id == 0) { error[warp_id] = 0; } __syncthreads(); if (chunk < num_chunks) { @@ -379,7 +370,7 @@ __global__ void __launch_bounds__(128) int32_t num_dict_pages = bs->ck.num_dict_pages; PageInfo* page_info; - if (!lane_id) { + if (lane_id == 0) { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; bs->page.chunk_idx = chunk; @@ -392,7 +383,9 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; - bs->page.kernel_mask = 0; + bs->page.temp_string_size = 0; + bs->page.temp_string_buf = nullptr; + bs->page.kernel_mask = decode_kernel_mask::NONE; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -415,6 +408,9 @@ __global__ void __launch_bounds__(128) bs->page.lvl_bytes[level_type::DEFINITION] = 0; bs->page.lvl_bytes[level_type::REPETITION] = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { + if (not is_supported_encoding(bs->page.encoding)) { + error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); + } switch (bs->page_type) { case PageType::DATA_PAGE: index_out = num_dict_pages + data_page_count; @@ -443,20 +439,25 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + if (bs->cur > bs->end) { + error[warp_id] |= static_cast(decode_error::DATA_STREAM_OVERRUN); + } bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } } index_out = shuffle(index_out); - if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) + if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) { page_info[index_out] = bs->page; + } num_values = shuffle(num_values); __syncwarp(); } if (lane_id == 0) { chunks[chunk].num_data_pages = data_page_count; chunks[chunk].num_dict_pages = dictionary_page_count; + if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } } } } @@ -512,11 +513,12 @@ __global__ void __launch_bounds__(128) void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, + int32_t* 
error_code, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, @@ -528,7 +530,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..e9ac3657e36 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -14,23 +14,28 @@ * limitations under the License. */ +#include "delta_binary.cuh" #include "page_decode.cuh" #include "page_string_utils.cuh" #include +#include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +#include +#include + +#include + +namespace cudf::io::parquet::detail { namespace { -constexpr int preprocess_block_size = 512; -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int delta_preproc_block_size = 64; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; /** * @brief Compute the start and end page value bounds for this page @@ -136,6 +141,25 @@ __device__ thrust::pair page_bounds(page_state_s* const s, bool skipped_values_set = false; bool end_value_set = false; + // If page_start_row >= min_row, then skipped_values is 0 and we don't have to search for + // start_value. If there's repetition then we've already calculated + // skipped_values/skipped_leaf_values. + // TODO(ets): If we hit this condition, and end_row > last row in page, then we can skip + // more of the processing below. + if (has_repetition or page_start_row >= min_row) { + if (t == 0) { + if (has_repetition) { + skipped_values = pp->skipped_values; + skipped_leaf_values = pp->skipped_leaf_values; + } else { + skipped_values = 0; + skipped_leaf_values = 0; + } + } + skipped_values_set = true; + __syncthreads(); + } + while (processed < s->page.num_input_values) { thread_index_type start_val = processed; @@ -145,11 +169,6 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // special case where page does not begin at a row boundary if (processed == 0 && rep_decode[0] != 0) { - if (t == 0) { - skipped_values = 0; - skipped_leaf_values = 0; - } - skipped_values_set = true; end_row++; // need to finish off the previous row row_fudge = 0; } @@ -453,12 +472,107 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, } /** - * @brief Kernel for computing string page output size information. + * @brief Compute string size information for DELTA_BYTE_ARRAY encoded strings. + * + * This traverses the packed prefix and suffix lengths, summing them to obtain the total + * number of bytes needed for the decoded string data. It also calculates an upper bound + * for the largest string length to obtain an upper bound on temporary space needed if + * rows will be skipped. + * + * Called with 64 threads. 
+ * + * @param data Pointer to the start of the page data stream + * @param end Pointer to the end of the page data stream + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + * @return A pair of `size_t` values representing the total string size and temp buffer size + * required for decoding + */ +__device__ thrust::pair totalDeltaByteArraySize(uint8_t const* data, + uint8_t const* end, + int start_value, + int end_value) +{ + using cudf::detail::warp_size; + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; + + __shared__ __align__(16) delta_binary_decoder prefixes; + __shared__ __align__(16) delta_binary_decoder suffixes; + + int const t = threadIdx.x; + int const lane_id = t % warp_size; + int const warp_id = t / warp_size; + + if (t == 0) { + auto const* suffix_start = prefixes.find_end_of_block(data, end); + suffixes.init_binary_block(suffix_start, end); + } + __syncthreads(); + + // two warps will traverse the prefixes and suffixes and sum them up + auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr; + + size_t total_bytes = 0; + uleb128_t max_len = 0; + + if (db != nullptr) { + // initialize with first value (which is stored in last_value) + if (lane_id == 0 && start_value == 0) { total_bytes = db->last_value; } + + uleb128_t lane_sum = 0; + uleb128_t lane_max = 0; + while (db->current_value_idx < end_value && + db->current_value_idx < db->num_encoded_values(true)) { + // calculate values for current mini-block + db->calc_mini_block_values(lane_id); + + // get per lane sum for mini-block + for (uint32_t i = 0; i < db->values_per_mb; i += 32) { + uint32_t const idx = db->current_value_idx + i + lane_id; + if (idx >= start_value && idx < end_value && idx < db->value_count) { + lane_sum += db->value[rolling_index(idx)]; + lane_max = max(lane_max, db->value[rolling_index(idx)]); + } + } + + if (lane_id == 0) { db->setup_next_mini_block(true); } + __syncwarp(); + } + + // get sum for warp. + // note: warp_sum will only be valid on lane 0. + auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + + if (lane_id == 0) { + total_bytes += warp_sum; + max_len = warp_max; + } + } + __syncthreads(); + + // now sum up total_bytes from the two warps + auto const final_bytes = + cudf::detail::single_lane_block_sum_reduce(total_bytes); + + // Sum up prefix and suffix max lengths to get a max possible string length. Multiply that + // by the number of strings in a mini-block, plus one to save the last string. + auto const temp_bytes = + cudf::detail::single_lane_block_sum_reduce(max_len) * + (db->values_per_mb + 1); + + return {final_bytes, temp_bytes}; +} + +/** + * @brief Kernel for computing string page bounds information. * - * String columns need accurate data size information to preallocate memory in the column buffer to - * store the char data. This calls a kernel to calculate information needed by the string decoding - * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct - * are updated. This call ignores non-string columns. + * This kernel traverses the repetition and definition level data to determine start and end values + * for pages with string-like data. Also calculates the number of null and valid values in the + * page. 
Does nothing if the page mask is neither `STRING` nor `DELTA_BYTE_ARRAY`. On exit the + * `num_nulls`, `num_valids`, `start_val` and `end_val` fields of the `PageInfo` struct will be + * populated. * * @param pages All pages to be decoded * @param chunks All chunks to be decoded @@ -467,7 +581,7 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -477,8 +591,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz int const t = threadIdx.x; PageInfo* const pp = &pages[page_idx]; - // reset str_bytes to 0 in case it's already been calculated - if (t == 0) { pp->str_bytes = 0; } + if (t == 0) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + // reset str_bytes to 0 in case it's already been calculated (esp needed for chunked reads). + // TODO: need to rethink this once str_bytes is in the statistics + pp->str_bytes = 0; + } // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; @@ -494,23 +613,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz {rep_runs}}; // setup page info - if (!setupLocalPageInfo( - s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { - return; - } - - if (!t) { - s->page.num_nulls = 0; - s->page.num_valids = 0; - s->page.str_bytes = 0; - } - __syncthreads(); + auto const mask = BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY); + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); - // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } - // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); @@ -519,8 +626,107 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { pp->num_nulls = s->page.num_nulls; pp->num_valids = s->page.num_valids; + pp->start_val = start_value; + pp->end_val = end_value; + } +} + +/** + * @brief Kernel for computing string page output size information for delta_byte_array encoding. + * + * This call ignores columns that are not DELTA_BYTE_ARRAY encoded. On exit the `str_bytes` field + * of the `PageInfo` struct will be populated. Also fills in the `temp_string_size` field if rows + * are to be skipped. 
+ * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } + + auto const start_value = pp->start_val; + + // if data size is known, can short circuit here + if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (t == 0) { + pp->str_bytes = pp->num_valids * s->dtype_len_in; + + // only need temp space if we're skipping values + if (start_value > 0) { + // just need to parse the header of the first delta binary block to get values_per_mb + delta_binary_decoder db; + db.init_binary_block(s->data_start, s->data_end); + // save enough for one mini-block plus some extra to save the last_string + pp->temp_string_size = s->dtype_len_in * (db.values_per_mb + 1); + } + } + } else { + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + auto const end_value = pp->end_val; + + auto const [len, temp_bytes] = totalDeltaByteArraySize(data, end, start_value, end_value); + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = len; + + // only need temp space if we're skipping values + if (start_value > 0) { pp->temp_string_size = temp_bytes; } + } + } +} + +/** + * @brief Kernel for computing string page output size information. + * + * This call ignores non-string columns. On exit the `str_bytes` field of the `PageInfo` struct will + * be populated. 
+ * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{decode_kernel_mask::STRING}, true)) { + return; } + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + auto const& col = s->col; size_t str_bytes = 0; // short circuit for FIXED_LEN_BYTE_ARRAY @@ -533,6 +739,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; int dict_size = 0; + auto const start_value = pp->start_val; + auto const end_value = pp->end_val; switch (pp->encoding) { case Encoding::PLAIN_DICTIONARY: @@ -564,6 +772,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { // TODO check for overflow pp->str_bytes = str_bytes; + + // only need temp space for delta + pp->temp_string_size = 0; } } @@ -589,6 +800,7 @@ __global__ void __launch_bounds__(decode_block_size) size_t num_rows, int32_t* error_code) { + using cudf::detail::warp_size; __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; __shared__ __align__(16) @@ -599,10 +811,12 @@ __global__ void __launch_bounds__(decode_block_size) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + int const lane_id = t % warp_size; [[maybe_unused]] null_count_back_copier _{s, t}; + auto const mask = decode_kernel_mask::STRING; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -633,6 +847,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. 
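[editor's note] The preprocessing kernels above size DELTA_BYTE_ARRAY pages by walking the two delta-binary blocks of prefix and suffix lengths and summing them, since each decoded string is a prefix shared with the previous string followed by its own suffix. A host-side sketch of that size computation, assuming the length arrays have already been unpacked (the real `totalDeltaByteArraySize` sums them warp-parallel, mini-block by mini-block, and also tracks the maximum length for the temp buffer):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical, already-unpacked prefix/suffix lengths for one page.
std::size_t delta_byte_array_bytes(std::vector<uint32_t> const& prefix_len,
                                   std::vector<uint32_t> const& suffix_len,
                                   std::size_t start_value,
                                   std::size_t end_value)
{
  std::size_t total = 0;
  for (std::size_t i = start_value; i < end_value and i < prefix_len.size(); ++i) {
    // decoded string i = first prefix_len[i] bytes of string i-1, then suffix i
    total += prefix_len[i] + suffix_len[i];
  }
  return total;
}

int main()
{
  // "hello", "help", "helpful" encoded as prefix/suffix length pairs
  std::vector<uint32_t> const prefix{0, 3, 4};
  std::vector<uint32_t> const suffix{5, 1, 3};
  return static_cast<int>(delta_byte_array_bytes(prefix, suffix, 0, 3));  // 16 bytes total
}
```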
@@ -646,9 +861,9 @@ __global__ void __launch_bounds__(decode_block_size) // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { @@ -748,37 +963,108 @@ __global__ void __launch_bounds__(decode_block_size) auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Functor used to set the `temp_string_buf` pointer for each page. `data` points to a buffer +// to be used when skipping rows in the delta_byte_array decoder. Given a page and an offset, +// set the page's `temp_string_buf` to be `data + offset`. +struct page_tform_functor { + uint8_t* const data; + + __device__ PageInfo operator()(PageInfo& page, int64_t offset) + { + if (page.temp_string_size != 0) { page.temp_string_buf = data + offset; } + return page; + } +}; + } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream) { - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + dim3 const dim_block(preprocess_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } + + // kernel mask may contain other kernels we don't need to count + int const count_mask = + kernel_mask & BitOr(decode_kernel_mask::DELTA_BYTE_ARRAY, decode_kernel_mask::STRING); + int const nkernels = std::bitset<32>(count_mask).count(); + auto const streams = cudf::detail::fork_streams(stream, nkernels); + + int s_idx = 0; + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + dim3 dim_delta(delta_preproc_block_size, 1); + gpuComputeDeltaPageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + gpuComputePageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + + // synchronize the streams + cudf::detail::join_streams(streams, stream); + + // check for needed temp space for DELTA_BYTE_ARRAY + auto const need_sizes = thrust::any_of( + rmm::exec_policy(stream), pages.d_begin(), pages.d_end(), [] __device__(auto& page) { + return page.temp_string_size != 0; + }); + + if (need_sizes) { + // sum up all of the temp_string_sizes + auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const total_size = 
thrust::transform_reduce(rmm::exec_policy(stream), + pages.d_begin(), + pages.d_end(), + page_sizes, + 0L, + thrust::plus{}); + + // now do an exclusive scan over the temp_string_sizes to get offsets for each + // page's chunk of the temp buffer + rmm::device_uvector page_string_offsets(pages.size(), stream); + thrust::transform_exclusive_scan(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + page_sizes, + 0L, + thrust::plus{}); + + // allocate the temp space + temp_string_buf.resize(total_size, stream); + + // now use the offsets array to set each page's temp_string_buf pointers + thrust::transform(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + pages.d_begin(), + page_tform_functor{temp_string_buf.data()}); + } } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +1088,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..9ab686b99d5 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include @@ -25,9 +27,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -45,79 +46,102 @@ struct file_ender_s { uint32_t magic; }; -// thrift generated code simplified. -struct StringType {}; -struct MapType {}; -struct ListType {}; -struct EnumType {}; +// thrift inspired code simplified. 
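[editor's note] `ComputePageStringSizes` above carves one shared scratch allocation into per-page slices: it sums every page's `temp_string_size`, exclusive-scans those sizes into offsets, and points each page's `temp_string_buf` at `base + offset` (the `page_tform_functor`). A serial host sketch of the same sub-allocation pattern; the `page` struct is illustrative, and the real code does this on device with `thrust::transform_reduce` and `thrust::transform_exclusive_scan`.

```cpp
#include <cstdint>
#include <vector>

struct page {
  int64_t temp_string_size = 0;
  uint8_t* temp_string_buf = nullptr;
};

int main()
{
  std::vector<page> pages{{16, nullptr}, {0, nullptr}, {8, nullptr}};

  // exclusive scan of the per-page sizes gives each page's offset,
  // and the running total gives the size of the single allocation
  int64_t total = 0;
  std::vector<int64_t> offsets(pages.size());
  for (std::size_t i = 0; i < pages.size(); ++i) {
    offsets[i] = total;
    total += pages[i].temp_string_size;
  }
  std::vector<uint8_t> buf(static_cast<std::size_t>(total));  // device_uvector in the real code

  // each page that needs scratch space points at its slice of the buffer
  for (std::size_t i = 0; i < pages.size(); ++i) {
    if (pages[i].temp_string_size != 0) { pages[i].temp_string_buf = buf.data() + offsets[i]; }
  }
  return static_cast<int>(total);  // 24
}
```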
struct DecimalType { int32_t scale = 0; int32_t precision = 0; }; -struct DateType {}; - -struct MilliSeconds {}; -struct MicroSeconds {}; -struct NanoSeconds {}; -using TimeUnit_isset = struct TimeUnit_isset { - bool MILLIS{false}; - bool MICROS{false}; - bool NANOS{false}; -}; struct TimeUnit { - TimeUnit_isset isset; - MilliSeconds MILLIS; - MicroSeconds MICROS; - NanoSeconds NANOS; + enum Type { UNDEFINED, MILLIS, MICROS, NANOS }; + Type type; }; struct TimeType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; + struct TimestampType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; + struct IntType { int8_t bitWidth = 0; bool isSigned = false; }; -struct NullType {}; -struct JsonType {}; -struct BsonType {}; - -// thrift generated code simplified. -using LogicalType_isset = struct LogicalType_isset { - bool STRING{false}; - bool MAP{false}; - bool LIST{false}; - bool ENUM{false}; - bool DECIMAL{false}; - bool DATE{false}; - bool TIME{false}; - bool TIMESTAMP{false}; - bool INTEGER{false}; - bool UNKNOWN{false}; - bool JSON{false}; - bool BSON{false}; -}; struct LogicalType { - LogicalType_isset isset; - StringType STRING; - MapType MAP; - ListType LIST; - EnumType ENUM; - DecimalType DECIMAL; - DateType DATE; - TimeType TIME; - TimestampType TIMESTAMP; - IntType INTEGER; - NullType UNKNOWN; - JsonType JSON; - BsonType BSON; + enum Type { + UNDEFINED, + STRING, + MAP, + LIST, + ENUM, + DECIMAL, + DATE, + TIME, + TIMESTAMP, + // 9 is reserved + INTEGER = 10, + UNKNOWN, + JSON, + BSON + }; + Type type; + thrust::optional decimal_type; + thrust::optional time_type; + thrust::optional timestamp_type; + thrust::optional int_type; + + LogicalType(Type tp = UNDEFINED) : type(tp) {} + LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} + LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} + LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} + LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} + + constexpr bool is_time_millis() const + { + return type == TIME and time_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_time_micros() const + { + return type == TIME and time_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_time_nanos() const + { + return type == TIME and time_type->unit.type == TimeUnit::NANOS; + } + + constexpr bool is_timestamp_millis() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_timestamp_micros() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_timestamp_nanos() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; + } + + constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } + + constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; } + + constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; } + + constexpr int32_t precision() const { return type == DECIMAL ? 
decimal_type->precision : -1; } }; /** @@ -126,8 +150,6 @@ struct LogicalType { struct ColumnOrder { enum Type { UNDEFINED, TYPE_ORDER }; Type type; - - operator Type() const { return type; } }; /** @@ -137,24 +159,35 @@ struct ColumnOrder { * as a schema tree. */ struct SchemaElement { - Type type = UNDEFINED_TYPE; - ConvertedType converted_type = UNKNOWN; - LogicalType logical_type; - int32_t type_length = - 0; // Byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + // 1: parquet physical type for output + Type type = UNDEFINED_TYPE; + // 2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + int32_t type_length = 0; + // 3: repetition of the field FieldRepetitionType repetition_type = REQUIRED; - std::string name = ""; - int32_t num_children = 0; - int32_t decimal_scale = 0; - int32_t decimal_precision = 0; - thrust::optional field_id = thrust::nullopt; - bool output_as_byte_array = false; + // 4: name of the field + std::string name = ""; + // 5: nested fields + int32_t num_children = 0; + // 6: DEPRECATED: record the original type before conversion to parquet type + thrust::optional converted_type; + // 7: DEPRECATED: record the scale for DECIMAL converted type + int32_t decimal_scale = 0; + // 8: DEPRECATED: record the precision for DECIMAL converted type + int32_t decimal_precision = 0; + // 9: save field_id from original schema + thrust::optional field_id; + // 10: replaces converted type + thrust::optional logical_type; + + // extra cudf specific fields + bool output_as_byte_array = false; // The following fields are filled in later during schema initialization int max_definition_level = 0; int max_repetition_level = 0; - int parent_idx = 0; - std::vector children_idx; + size_type parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { @@ -206,7 +239,7 @@ struct SchemaElement { { return type == UNDEFINED_TYPE && // this assumption might be a little weak. 
- ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2)); + ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1)); } }; @@ -214,12 +247,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** @@ -405,6 +444,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..129d4e4d28c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -31,11 +31,14 @@ #include #include +#include + #include +#include #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -54,19 +57,46 @@ constexpr int rolling_index(int index) return index % rolling_size; } +// see setupLocalPageInfo() in page_decode.cuh for supported page encodings +constexpr bool is_supported_encoding(Encoding enc) +{ + switch (enc) { + case Encoding::PLAIN: + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE: + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: return true; + default: return false; + } +} + +/** + * 
@brief Atomically OR `error` into `error_code`. + */ +constexpr void set_error(int32_t error, int32_t* error_code) +{ + if (error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(error, cuda::std::memory_order_relaxed); + } +} + /** * @brief Enum for the different types of errors that can occur during decoding. * * These values are used as bitmasks, so they must be powers of 2. */ enum class decode_error : int32_t { - DATA_STREAM_OVERRUN = 0x1, - LEVEL_STREAM_OVERRUN = 0x2, - UNSUPPORTED_ENCODING = 0x4, - INVALID_LEVEL_RUN = 0x8, - INVALID_DATA_TYPE = 0x10, - EMPTY_PAGE = 0x20, - INVALID_DICT_WIDTH = 0x40, + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, + DELTA_PARAM_MISMATCH = 0x80, + DELTA_PARAMS_UNSUPPORTED = 0x100, }; /** @@ -88,7 +118,47 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { +// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// this is in C++23 +#if !defined(__cpp_lib_is_scoped_enum) +template > +struct is_scoped_enum { + static const bool value = not std::is_convertible_v>; +}; + +template +struct is_scoped_enum { + static const bool value = false; +}; +#else +using std::is_scoped_enum; +#endif + +// helpers to do bit operations on scoped enums +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitAnd(T1 a, T2 b) +{ + return static_cast(a) & static_cast(b); +} + +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitOr(T1 a, T2 b) +{ + return static_cast(a) | static_cast(b); +} /** * @brief Enums for the flags in the page header @@ -113,10 +183,12 @@ enum level_type { * * Used to control which decode kernels to run. 
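[editor's note] Because `decode_kernel_mask` (declared just below) and `encode_kernel_mask` are scoped enums, they no longer convert implicitly to integers, which is why the header adds the `BitAnd`/`BitOr` helpers shown above. A small self-contained sketch of that idiom, simplified by dropping the SFINAE constraint and using an illustrative subset of the enum:

```cpp
#include <cstdint>

enum class decode_kernel_mask : uint32_t {
  NONE    = 0,
  GENERAL = 1 << 0,
  STRING  = 1 << 1,
};

// Scoped enums need explicit casts; wrapping the casts keeps call sites readable.
template <typename T1, typename T2>
constexpr uint32_t BitOr(T1 a, T2 b)
{
  return static_cast<uint32_t>(a) | static_cast<uint32_t>(b);
}

template <typename T1, typename T2>
constexpr uint32_t BitAnd(T1 a, T2 b)
{
  return static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
}

int main()
{
  uint32_t const mask = BitOr(decode_kernel_mask::GENERAL, decode_kernel_mask::STRING);
  // launch the string decoder only if some page requested it
  bool const need_string_kernel = BitAnd(mask, decode_kernel_mask::STRING) != 0;
  return need_string_kernel ? 0 : 1;
}
```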
*/ -enum kernel_mask_bits { - KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel - KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data - KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data +enum class decode_kernel_mask { + NONE = 0, + GENERAL = (1 << 0), // Run catch-all decode kernel + STRING = (1 << 1), // Run decode kernel for string data + DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data + DELTA_BYTE_ARRAY = (1 << 3) // Run decode kernel for DELTA_BYTE_ARRAY encoded data }; /** @@ -197,9 +269,11 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page - // the next two are calculated in gpuComputePageStringSizes + // the next four are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows + int32_t start_val; // index of first value of the string data stream to use + int32_t end_val; // index of last value in string data stream int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -236,7 +310,11 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; - uint32_t kernel_mask; + // temporary space for decoding DELTA_BYTE_ARRAY encoded strings + int64_t temp_string_size; + uint8_t* temp_string_buf; + + decode_kernel_mask kernel_mask; }; /** @@ -258,7 +336,7 @@ struct ColumnChunkDesc { uint8_t rep_level_bits_, int8_t codec_, int8_t converted_type_, - LogicalType logical_type_, + thrust::optional logical_type_, int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, @@ -300,99 +378,26 @@ struct ColumnChunkDesc { uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages - int32_t max_num_pages{}; // size of page_info array - PageInfo* page_info{}; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum - LogicalType logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo* page_info{}; // output page info for up to num_dict_pages + + // num_data_pages (dictionary pages first) + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base 
pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + thrust::optional logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ @@ -446,6 +451,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) return 1 << static_cast(encoding); } +/** + * @brief Enum of mask bits for the EncPage kernel_mask + * + * Used to control which encode kernels to run. + */ +enum class encode_kernel_mask { + PLAIN = (1 << 0), // Run plain encoding kernel + DICTIONARY = (1 << 1), // Run dictionary encoding kernel + DELTA_BINARY = (1 << 2) // Run DELTA_BINARY_PACKED encoding kernel +}; + /** * @brief Struct describing an encoder column chunk */ @@ -504,10 +520,11 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) + compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run }; /** @@ -527,9 +544,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + int32_t* error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -599,16 +620,20 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * * @param[in,out] pages All pages to be decoded * @param[in] chunks All chunks to be decoded + * @param[out] temp_string_buf Temporary space needed for decoding DELTA_BYTE_ARRAY strings * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] kernel_mask Mask of kernels to run * @param[in] stream CUDA stream to use */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream); /** @@ -667,7 +692,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -677,6 +702,28 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, int32_t* error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BYTE_ARRAY column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * @@ -739,7 +786,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +809,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +828,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +894,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +910,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T 
convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -150,12 +150,14 @@ struct stats_caster { { } - void set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to @@ -527,4 +529,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..6e799424d01 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -15,30 +15,35 @@ */ #include "reader_impl.hpp" +#include "error.hpp" #include #include #include #include -#include #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto const 
level_type_size = _pass_itm_data->level_type_size; + + // temporary space for DELTA_BYTE_ARRAY decoding. this only needs to live until + // gpu::DecodeDeltaByteArray returns. + rmm::device_uvector delta_temp_buf(0, _stream); // Should not reach here if there is no page data. CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,11 +56,12 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). - auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = + (kernel_mask & BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY)) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); + ComputePageStringSizes( + pages, chunks, delta_temp_buf, skip_rows, num_rows, level_type_size, kernel_mask, _stream); col_sizes = calculate_page_string_offsets(); @@ -162,33 +168,37 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } - rmm::device_scalar error_code(0, _stream); + // create this before we fork streams + kernel_error error_code(_stream); // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _pass_itm_data->level_type_size; - // launch string decoder int s_idx = 0; - if (has_strings) { - auto& stream = streams[s_idx++]; - chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + DecodeStringPageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + } + + // launch delta byte array decoder + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + DecodeDeltaByteArray( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if (BitAnd(kernel_mask, decode_kernel_mask::GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -199,11 +209,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_nesting.device_to_host_async(_stream); 
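[editor's note] `decode_page_data` above forks one stream per decode kernel selected by the mask, launches each kernel on its own stream, and joins back into the main stream before the device-to-host copies. cuDF wraps this in its internal `cudf::detail::fork_streams`/`join_streams` helpers; the sketch below shows the same fork/join idiom in bare CUDA with event-based ordering, and the two empty kernels are placeholders for the real per-encoding decoders.

```cpp
#include <cuda_runtime.h>

#include <vector>

// Placeholder kernels standing in for the real per-encoding decoders.
__global__ void decode_general() {}
__global__ void decode_string() {}

int main()
{
  cudaStream_t main_stream;
  cudaStreamCreate(&main_stream);

  // "Fork": each child stream waits on an event recorded in the main stream,
  // so earlier host-to-device copies are visible to the decode kernels.
  cudaEvent_t forked;
  cudaEventCreate(&forked);
  cudaEventRecord(forked, main_stream);

  std::vector<cudaStream_t> streams(2);
  for (auto& s : streams) {
    cudaStreamCreate(&s);
    cudaStreamWaitEvent(s, forked, 0);
  }

  // Independent kernels run concurrently, one per stream.
  decode_general<<<1, 32, 0, streams[0]>>>();
  decode_string<<<1, 32, 0, streams[1]>>>();

  // "Join": the main stream waits on an event recorded in every child stream.
  for (auto& s : streams) {
    cudaEvent_t done;
    cudaEventCreate(&done);
    cudaEventRecord(done, s);
    cudaStreamWaitEvent(main_stream, done, 0);
    cudaEventDestroy(done);
  }
  cudaStreamSynchronize(main_stream);

  for (auto& s : streams) { cudaStreamDestroy(s); }
  cudaEventDestroy(forked);
  cudaStreamDestroy(main_stream);
  return 0;
}
```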
page_nesting_decode.device_to_host_async(_stream); - auto const decode_error = error_code.value(_stream); - if (decode_error != 0) { - std::stringstream stream; - stream << std::hex << decode_error; - CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str()); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } // for list columns, add the final offset to every offset buffer. @@ -248,13 +255,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +327,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -349,14 +356,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,16 +371,16 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. 
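[editor's note] The error check above works because the decode kernels OR `decode_error` bits into a single `int32_t` via `set_error` (shown earlier), so a nonzero value may carry several failures at once and is reported in hex rather than decimal. A hedged host-side sketch of that accumulate-and-report flow; `std::atomic` stands in for the device-side `cuda::atomic_ref`, and only a subset of the error bits is reproduced here.

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <sstream>

enum class decode_error : int32_t {
  DATA_STREAM_OVERRUN  = 0x1,
  UNSUPPORTED_ENCODING = 0x4,
};

// Host-side stand-in for set_error(): many workers OR their failure bits
// into one shared word (the kernels use cuda::atomic_ref<...>::fetch_or).
void set_error(int32_t error, std::atomic<int32_t>* error_code)
{
  if (error != 0) { error_code->fetch_or(error, std::memory_order_relaxed); }
}

int main()
{
  std::atomic<int32_t> error_code{0};
  set_error(static_cast<int32_t>(decode_error::UNSUPPORTED_ENCODING), &error_code);
  set_error(static_cast<int32_t>(decode_error::DATA_STREAM_OVERRUN), &error_code);

  if (int32_t const err = error_code.load(); err != 0) {
    // report in hex so the individual error bits stay readable
    std::stringstream ss;
    ss << std::hex << err;
    std::cout << "Parquet data decode failed with code(s) 0x" << ss.str() << "\n";  // 0x5
  }
  return 0;
}
```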
- setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -521,7 +528,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -541,8 +548,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + size_t const num_input_passes = std::max( + int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || (_current_input_pass < num_input_passes); } @@ -571,4 +578,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -22,6 +22,7 @@ #pragma once #include "parquet_gpu.hpp" +#include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" #include @@ -35,7 +36,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -136,10 +137,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - void load_global_chunk_info(); - void compute_input_pass_row_group_info(); - void setup_pass(); - /** * @brief Create chunk information and start file reads * @@ -250,6 +247,31 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Close out the existing pass (if any) and prepare for the next pass. + */ + void setup_next_pass(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. 
+ */ + void compute_splits_for_pass(); + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -261,10 +283,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -278,27 +300,24 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level there is the "pass" in which we try and limit the + // At the top level, the entire file is divided up into "passes" omn which we try and limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. // // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; - - // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents - // the start/end of the chunks to be loaded for a given pass. - std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; + std::unique_ptr _pass_itm_data; bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes + + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..213fc380a34 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. 
+ */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. 
Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. 
+ * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info( + type_id column_type_id, + type_id timestamp_type_id, + Type physical, + thrust::optional converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted.value_or(UNKNOWN); + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows 
parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // 
adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
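To make the splitting concrete, a small worked example with invented numbers: suppose _output_chunk_read_limit is 100 MB and the aggregated cumulative entries {row_count, size_bytes} come out as {100, 60 MB}, {200, 110 MB}, {300, 150 MB}, {400, 230 MB}. find_splits searches for the point where the size accumulated since the last split reaches 100 MB, lands on {200, 110 MB}, steps back because 110 MB exceeds the limit, and emits {skip_rows 0, num_rows 100} with 60 MB as the new baseline. Relative to that baseline the remaining entries weigh 50/90/170 MB, so the next split becomes {100, 200} with a 150 MB baseline, and the final 80 MB tail becomes {300, 100}. Each emitted pair ends up as one entry of output_chunk_read_info, i.e. one table handed back by a read_chunk() call.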
+ // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..dfc239d8451 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "reader_impl_helpers.hpp" + +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. 
+ std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..a9c84143e1a 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,50 +21,48 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(LogicalType const& logical) +ConvertedType logical_type_to_converted_type(thrust::optional const& logical) { - if (logical.isset.STRING) { - return parquet::UTF8; - } else if (logical.isset.MAP) { - return parquet::MAP; - } else if (logical.isset.LIST) { - return parquet::LIST; - } else if (logical.isset.ENUM) { - return parquet::ENUM; - } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return parquet::DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return parquet::NA; - } else if (logical.isset.JSON) { - return parquet::JSON; - } else if (logical.isset.BSON) { - return parquet::BSON; + if (not logical.has_value()) { return UNKNOWN; } + switch (logical->type) { + case LogicalType::STRING: return UTF8; + case LogicalType::MAP: return MAP; + case LogicalType::LIST: return LIST; + case LogicalType::ENUM: return ENUM; + case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision + case LogicalType::DATE: return DATE; + case LogicalType::TIME: + if (logical->is_time_millis()) { + return TIME_MILLIS; + } else if (logical->is_time_micros()) { + return TIME_MICROS; + } + break; + case LogicalType::TIMESTAMP: + if (logical->is_timestamp_millis()) { + return TIMESTAMP_MILLIS; + } else if (logical->is_timestamp_micros()) { + return TIMESTAMP_MICROS; + } + break; + case LogicalType::INTEGER: + switch (logical->bit_width()) { + case 8: return logical->is_signed() ? INT_8 : UINT_8; + case 16: return logical->is_signed() ? INT_16 : UINT_16; + case 32: return logical->is_signed() ? 
INT_32 : UINT_32; + case 64: return logical->is_signed() ? INT_64 : UINT_64; + default: break; + } + case LogicalType::UNKNOWN: return NA; + case LogicalType::JSON: return JSON; + case LogicalType::BSON: return BSON; + default: break; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +74,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + auto const physical = schema.type; + auto const logical_type = schema.logical_type; + auto converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; + // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to + // converted_type if logical_type isn't set // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { - decimal_precision = schema.logical_type.DECIMAL.precision; - } - - switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } + + switch (converted_type.value_or(UNKNOWN)) { + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +117,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,22 +131,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIME.unit.isset.NANOS) { - return type_id::DURATION_NANOSECONDS; + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { + if (logical_type->is_timestamp_nanos()) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; + } } // is it simply a struct? @@ -157,16 +153,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -175,6 +171,81 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } +void metadata::sanitize_schema() +{ + // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred. + // There are also a lot of rules that simply aren't followed and are expected to be worked around. + // This step sanitizes the metadata to something that isn't ambiguous. 
+ // + // Take, for example, the following schema: + // + // required group field_id=-1 user { + // required int32 field_id=-1 id; + // optional group field_id=-1 phoneNumbers { + // repeated group field_id=-1 phone { + // required int64 field_id=-1 number; + // optional binary field_id=-1 kind (String); + // } + // } + // } + // + // This real-world example has no annotations telling us what is a list or a struct. On the + // surface this looks like a column of id's and a column of list>, but this + // actually should be interpreted as a struct>>. The phoneNumbers field + // has to be a struct because it is a group with no repeated tag and we have no annotation. The + // repeated group is actually BOTH a struct due to the multiple children and a list due to + // repeated. + // + // This code attempts to make this less messy for the code that follows. + + std::function process = [&](size_t schema_idx) -> void { + if (schema_idx < 0) { return; } + auto& schema_elem = schema[schema_idx]; + if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) { + auto const parent_type = schema[schema_elem.parent_idx].converted_type; + if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 && + parent_type != LIST && parent_type != MAP) { + // This is a list of structs, so we need to mark this as a list, but also + // add a struct child and move this element's children to the struct + schema_elem.converted_type = LIST; + schema_elem.repetition_type = OPTIONAL; + auto const struct_node_idx = static_cast(schema.size()); + + SchemaElement struct_elem; + struct_elem.name = "struct_node"; + struct_elem.repetition_type = REQUIRED; + struct_elem.num_children = schema_elem.num_children; + struct_elem.type = UNDEFINED_TYPE; + struct_elem.converted_type = UNKNOWN; + + // swap children + struct_elem.children_idx = std::move(schema_elem.children_idx); + schema_elem.children_idx = {struct_node_idx}; + schema_elem.num_children = 1; + + struct_elem.max_definition_level = schema_elem.max_definition_level; + struct_elem.max_repetition_level = schema_elem.max_repetition_level; + schema_elem.max_definition_level--; + schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level; + + // change parent index on new node and on children + struct_elem.parent_idx = schema_idx; + for (auto& child_idx : struct_elem.children_idx) { + schema[child_idx].parent_idx = struct_node_idx; + } + // add our struct + schema.push_back(struct_elem); + } + } + + for (auto& child_idx : schema_elem.children_idx) { + process(child_idx); + } + }; + + process(0); +} + metadata::metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); @@ -195,6 +266,7 @@ metadata::metadata(datasource* source) CompactProtocolReader cp(buffer->data(), ender->footer_len); CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); } std::vector aggregate_reader_metadata::metadatas_from_sources( @@ -344,7 +416,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +434,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = 
cudf::io::detail::skip_rows_num_rows_from_options( @@ -402,7 +474,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -420,17 +492,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -445,13 +518,16 @@ aggregate_reader_metadata::select_columns(std::optional child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } + auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)); + // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) + auto const col_type = one_level_list ? type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -485,13 +561,14 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + if (one_level_list) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -506,9 +583,7 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { - nesting.pop_back(); - } + if (one_level_list) { nesting.pop_back(); } path_is_valid = true; // If we're able to reach leaf then path is valid } @@ -656,4 +731,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..8d8ab8707be 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,24 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { -using namespace cudf::io::parquet; +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -58,6 +73,7 @@ using namespace cudf::io::parquet; */ struct metadata : public FileMetaData { explicit metadata(datasource* source); + void sanitize_schema(); }; class aggregate_reader_metadata { @@ -181,7 +197,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -201,12 +217,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -275,4 +292,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..0bc492546e9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -14,11 +14,11 @@ * limitations under the License. */ +#include "error.hpp" #include "reader_impl.hpp" #include #include -#include #include #include @@ -43,7 +43,7 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -169,46 +169,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. 
- */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. * @@ -226,7 +186,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +199,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,15 +259,20 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; + kernel_error error_code(stream); chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); chunks.device_to_host_sync(stream); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str()); + } + for (size_t c = 0; c < chunks.size(); c++) { total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; } @@ -316,19 +280,6 @@ template return total_pages; } -// see setupLocalPageInfo() in page_data.cu for supported page encodings -constexpr bool 
is_supported_encoding(Encoding enc) -{ - switch (enc) { - case Encoding::PLAIN: - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE: - case Encoding::RLE_DICTIONARY: - case Encoding::DELTA_BINARY_PACKED: return true; - default: return false; - } -} - /** * @brief Decode the page information from the given column chunks. * @@ -337,8 +288,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -349,33 +300,30 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c page_count += chunks[c].max_num_pages; } + kernel_error error_code(stream); chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); + + if (error_code.value() != 0) { + // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one + CUDF_FAIL("Parquet header parsing failed with code(s)" + error_code.str()); + } // compute max bytes needed for level data - auto level_bit_size = - cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { + auto level_bit_size = cudf::detail::make_counting_transform_iterator( + 0, [chunks = chunks.d_begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. 
- int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), + int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), level_bit_size, level_bit_size + chunks.size(), 0, thrust::maximum()); - auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - - pages.device_to_host_sync(stream); - - // validate page encodings - CUDF_EXPECTS(std::all_of(pages.begin(), - pages.end(), - [](auto const& page) { return is_supported_encoding(page.encoding); }), - "Unsupported page encoding detected"); - return level_type_size; + return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); } /** @@ -388,11 +336,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +360,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +390,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +427,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
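A quick worked example for the level_type_size computation in the decode_page_headers hunk above: each chunk contributes the larger of its definition- and repetition-level bit widths, the reduce takes the maximum over all chunks, and the result is rounded up to whole bytes with a floor of 1. A file whose deepest column has max_definition_level = 5 and max_repetition_level = 3 needs NumRequiredBits(5) = 3 bits, giving level_type_size = max(1, ceil(3 / 8)) = 1 byte per level value; only a schema whose level values exceed 255 (more than 8 bits) pushes this to 2, which is why the value is threaded through to every decode kernel rather than hard-coded.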
@@ -509,11 +454,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +470,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +539,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +598,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +662,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -793,164 +738,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col 
= _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const 
row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. 
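The functions removed in this hunk (load_global_chunk_info, compute_input_pass_row_group_info, setup_pass) implement chunked reading by splitting the selected row groups into passes that respect the input read limit: row-group byte sizes are accumulated until adding the next group would exceed the limit, and every pass keeps at least one row group even if it alone is over the limit. A self-contained sketch of that greedy split, assuming only a vector of per-row-group byte sizes and omitting the parallel row-count bookkeeping, could look like this (split_into_passes is a hypothetical name):

// Sketch of the pass-splitting loop from compute_input_pass_row_group_info.
#include <cstddef>
#include <vector>

std::vector<std::size_t> split_into_passes(std::vector<std::size_t> const& rg_byte_sizes,
                                           std::size_t read_limit)
{
  std::vector<std::size_t> pass_offsets{0};  // indices into rg_byte_sizes
  if (read_limit == 0) {                     // no limit: read everything in a single pass
    pass_offsets.push_back(rg_byte_sizes.size());
    return pass_offsets;
  }
  std::size_t cur_bytes = 0;
  std::size_t cur_start = 0;
  for (std::size_t i = 0; i < rg_byte_sizes.size(); ++i) {
    if (cur_bytes + rg_byte_sizes[i] >= read_limit) {
      if (cur_start == i) {  // a single row group exceeds the limit: it forms its own pass
        pass_offsets.push_back(i + 1);
        cur_start = i + 1;
        cur_bytes = 0;
      } else {               // end the pass before this row group
        pass_offsets.push_back(i);
        cur_start = i;
        cur_bytes = rg_byte_sizes[i];
      }
    } else {
      cur_bytes += rg_byte_sizes[i];
    }
  }
  if (pass_offsets.back() != rg_byte_sizes.size()) { pass_offsets.push_back(rg_byte_sizes.size()); }
  return pass_offsets;
}

For example, sizes {100, 500, 200} with a 600-byte limit yield offsets {0, 1, 2, 3}: one pass per row group, since each addition would cross the limit but a pass must still contain at least one group.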
@@ -970,15 +757,16 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); + pages.device_to_host_sync(_stream); if (has_compressed_data) { decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -998,7 +786,6 @@ void reader::impl::load_and_decompress_data() // std::vector output_info = build_output_column_info(); // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); { // nesting information (sizes, etc) stored -per page- // note : even for flat schemas, we allocate 1 level of "nesting" info @@ -1019,14 +806,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1039,372 +825,19 @@ void print_pages(cudf::detail::hostdevice_vector& pages, p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - 
printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. - */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. 
- */ -struct get_cumulative_row_info { - gpu::PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. 
- if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +874,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +883,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +901,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +923,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +962,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +973,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1014,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1064,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1169,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1180,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1216,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1231,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1831,12 +1262,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1280,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). 
if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1306,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -1989,7 +1416,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); // do scan by key to calculate string offsets for each page - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -1997,7 +1424,7 @@ std::vector reader::impl::calculate_page_string_offsets() // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); - thrust::reduce_by_key(rmm::exec_policy(_stream), + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -2014,4 +1441,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..c2b10e09b1a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ 
void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -281,12 +278,14 @@ struct leaf_schema_fn { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; bool timestamp_is_int96; + bool timestamp_is_utc; template std::enable_if_t, void> operator()() { col_schema.type = Type::BOOLEAN; col_schema.stats_dtype = statistics_dtype::dtype_bool; + // BOOLEAN needs no converted or logical type } template @@ -295,6 +294,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, true}}; } template @@ -303,6 +303,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, true}}; } template @@ -310,6 +311,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + // INT32 needs no converted or logical type } template @@ -317,6 +319,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // INT64 needs no converted or logical type } template @@ -325,6 +328,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, false}}; } template @@ -333,6 +337,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, false}}; } template @@ -341,6 +346,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{IntType{32, false}}; } template @@ -349,6 +355,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT64; col_schema.converted_type = ConvertedType::UINT_64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{IntType{64, false}}; } template @@ -356,6 +363,7 @@ struct leaf_schema_fn { { col_schema.type = Type::FLOAT; col_schema.stats_dtype = statistics_dtype::dtype_float32; + // FLOAT needs no converted or logical type } template @@ -363,6 +371,7 @@ struct leaf_schema_fn { { col_schema.type = Type::DOUBLE; col_schema.stats_dtype = statistics_dtype::dtype_float64; + // DOUBLE needs no converted or logical type } template @@ -370,11 +379,12 @@ struct leaf_schema_fn { { col_schema.type = Type::BYTE_ARRAY; if (col_meta.is_enabled_output_as_binary()) { - col_schema.converted_type = ConvertedType::UNKNOWN; - col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + // BYTE_ARRAY needs no converted or logical type } else { col_schema.converted_type = ConvertedType::UTF8; col_schema.stats_dtype = statistics_dtype::dtype_string; + col_schema.logical_type = 
LogicalType{LogicalType::STRING}; } } @@ -384,49 +394,55 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{LogicalType::DATE}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; col_schema.ts_scale = 1000; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value } // set logical type if it's not int96 else { - col_schema.logical_type.isset.TIMESTAMP = true; - col_schema.logical_type.TIMESTAMP.unit.isset.NANOS = true; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; } } @@ -434,53 +450,48 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MICROS = true; + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; } // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.NANOS = true; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } template @@ -490,27 +501,32 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_decimal64; col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { CUDF_FAIL("Unsupported fixed point type for parquet writer"); } col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs + col_schema.logical_type->decimal_type->scale = -col->type().scale(); if (col_meta.is_decimal_precision_set()) { CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, "Precision must be equal to or greater than scale!"); if (col_schema.type == Type::INT64 and col_meta.get_decimal_precision() < 10) { CUDF_LOG_WARN("Parquet writer: writing a decimal column with precision < 10 as int64"); } - col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.logical_type->decimal_type->precision = col_meta.get_decimal_precision(); } } @@ -552,7 +568,8 @@ std::vector construct_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, - bool int96_timestamps) + bool int96_timestamps, + bool utc_timestamps) { std::vector schema; schema_tree_node root{}; @@ -596,7 +613,7 @@ std::vector construct_schema_tree( schema_tree_node col_schema{}; col_schema.type = Type::BYTE_ARRAY; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -724,8 +741,9 @@ std::vector construct_schema_tree( bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); - cudf::type_dispatcher(col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96}); + cudf::type_dispatcher( + col->type(), + leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -761,11 +779,14 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const + { + return schema_node.converted_type.value_or(UNKNOWN); + } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +867,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +918,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +952,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +961,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +993,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); 
detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1029,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1042,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1067,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1093,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1125,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1151,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate 
slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1170,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1218,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1242,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1307,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1329,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1339,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1349,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1358,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1398,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1450,6 +1470,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 + * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any * data in this function @@ -1474,12 +1495,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t max_dictionary_size, single_write_mode write_mode, bool int96_timestamps, + bool utc_timestamps, bool write_v2_headers, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = construct_schema_tree(vec, table_meta, write_mode, int96_timestamps); + auto vec = table_to_linked_columns(input); + auto schema_tree = + construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1499,8 +1522,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1599,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1611,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1685,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1701,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1723,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1754,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 
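The chunk and fragment setup above relies on cudf's internal mirrored host/device containers: values are filled on the host, pushed with host_to_device_async before any kernel launch, and pulled back with device_to_host_sync once the kernels have run. A minimal sketch of that round trip, using only the calls visible in these hunks (the int element type and the include path are placeholders, not the writer's real types):

#include <rmm/cuda_stream_view.hpp>
#include <io/utilities/hostdevice_vector.hpp>  // internal cudf header; exact path is an assumption

void round_trip_sketch(rmm::cuda_stream_view stream)
{
  // rows x cols table mirrored on host and device
  cudf::detail::hostdevice_2dvector<int> tbl(4, 3, stream);

  tbl[0][0] = 42;                    // fill the host mirror via row-wise operator[]
  tbl.host_to_device_async(stream);  // copy host -> device before launching kernels

  // ... kernels would consume tbl.device_view() here ...

  tbl.device_to_host_sync(stream);   // copy device -> host and synchronize the stream
  int const round_tripped = tbl[0][0];
  (void)round_tripped;
}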
0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1772,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1840,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1873,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1887,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
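The total-fragment computation spanning the hunk above recovers the grand total as "last offset plus last count", with a guard for the empty case. A standalone sketch of that offsets-plus-total idiom in plain standard C++ (how the offsets themselves are produced is off-screen here; an exclusive scan is assumed):

#include <numeric>
#include <vector>

int offsets_plus_total_sketch()
{
  std::vector<int> counts{3, 1, 4};  // e.g. fragments per column
  std::vector<int> offsets(counts.size());
  std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0);  // {0, 3, 4}

  // total = last offset + last count, guarding the empty case as the writer does
  return counts.empty() ? 0 : offsets.back() + counts.back();  // 8
}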
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1897,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1983,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2009,6 +2032,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2037,6 +2061,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2114,6 +2139,7 @@ void writer::impl::write(table_view const& input, std::vector co _max_dictionary_size, _single_write_mode, _int96_timestamps, + _utc_timestamps, _write_v2_headers, _out_sink, _stream); @@ -2142,8 +2168,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2235,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2377,6 +2403,15 @@ std::unique_ptr> writer::merge_row_group_metadata( } } + // Remove any LogicalType::UNKNOWN annotations that were passed in as they can confuse + // column type inferencing. 
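Both writer::impl constructors above now capture the new flag via options.is_enabled_utc_timestamps() and thread it through write() into convert_table_to_parquet_data. A hedged caller-side sketch of enabling it; the builder setter name utc_timestamps(bool) is an assumption inferred from the getter and is not part of this diff:

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_with_utc_annotation(cudf::table_view const& tbl)
{
  auto sink = cudf::io::sink_info{"out.parquet"};
  auto opts = cudf::io::parquet_writer_options::builder(sink, tbl)
                .utc_timestamps(true)  // hypothetical setter matching is_enabled_utc_timestamps()
                .build();
  cudf::io::write_parquet(opts);
}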
+ // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 + for (auto& se : md.schema) { + if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { + se.logical_type = thrust::nullopt; + } + } + // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; @@ -2392,7 +2427,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..3415205d179 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -161,12 +157,14 @@ class writer::impl { size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; bool const _int96_timestamps; + bool const _utc_timestamps; bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +178,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. 
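Both writer files above also replace the four separate namespace blocks (and their matching end comments) with a single C++17 nested namespace definition, folding the old cudf::io::detail::parquet into cudf::io::parquet::detail. The two spellings declare the same scope; a minimal illustration with placeholder types:

// Single-line form used after this change:
namespace cudf::io::parquet::detail {
struct example_tag {};  // placeholder, for illustration only
}  // namespace cudf::io::parquet::detail

// Equivalent pre-C++17 nesting:
namespace cudf { namespace io { namespace parquet { namespace detail {
struct another_tag {};  // placeholder
}  // namespace detail
}  // namespace parquet
}  // namespace io
}  // namespace cudf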
}; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7a7121aa91d..a466ef84133 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -360,6 +360,11 @@ class user_datasource_wrapper : public datasource { return source->supports_device_read(); } + [[nodiscard]] bool is_device_read_preferred(size_t size) const override + { + return source->is_device_read_preferred(size); + } + size_t device_read(size_t offset, size_t size, uint8_t* dst, @@ -375,8 +380,18 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return source->device_read_async(offset, size, dst, stream); + } + [[nodiscard]] size_t size() const override { return source->size(); } + [[nodiscard]] bool is_empty() const override { return source->is_empty(); } + private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 1bc126d3be9..e59c1089318 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -114,6 +114,7 @@ std::string ptx_parser::parse_instruction(std::string const& src) size_t start = 0; size_t stop = 0; bool is_instruction = true; + bool is_pragma_instruction = false; bool is_param_loading_instruction = false; std::string constraint; std::string register_type; @@ -181,6 +182,9 @@ std::string ptx_parser::parse_instruction(std::string const& src) "value through the first function parameter. 
Thus the `st.param.***` instructions " "are not processed. *** */" + "\");" + original_code; // Our port does not support return value; + } else if (piece.find(".pragma") != std::string::npos) { + is_pragma_instruction = true; + output += " " + piece; } else if (piece[0] == '@') { output += " @" + remove_nonalphanumeric(piece.substr(1, piece.size() - 1)); } else { @@ -200,6 +204,17 @@ std::string ptx_parser::parse_instruction(std::string const& src) } // Here we get to see the actual type of the input arguments. input_arg_list[remove_nonalphanumeric(piece)] = register_type_to_cpp_type(register_type); + } else if (is_pragma_instruction) { + // quote any string + std::string transformed_piece; + for (const auto& c : piece) { + if (c == '"') { + transformed_piece += "\\\""; + } else { + transformed_piece += c; + } + } + output += transformed_piece; } else { output += escape_percent(std::string(src, start, stop - start)); } diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/json/json_path.cu similarity index 98% rename from cpp/src/strings/json/json_path.cu rename to cpp/src/json/json_path.cu index c56752f5429..8217e34723c 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -41,7 +41,6 @@ #include namespace cudf { -namespace strings { namespace detail { namespace { @@ -224,7 +223,9 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; class json_state : private parser { public: __device__ json_state() : parser() {} - __device__ json_state(char const* _input, int64_t _input_len, get_json_object_options _options) + __device__ json_state(char const* _input, + int64_t _input_len, + cudf::get_json_object_options _options) : parser(_input, _input_len), options(_options) @@ -956,9 +957,6 @@ __launch_bounds__(block_size) __global__ } } -/** - * @copydoc cudf::strings::detail::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options, @@ -1011,7 +1009,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = create_chars_child_column(output_size, stream, mr); + auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1041,17 +1039,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace } // namespace detail -/** - * @copydoc cudf::strings::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object(col, json_path, options, cudf::get_default_stream(), mr); + return detail::get_json_object(col, json_path, options, stream, mr); } -} // namespace strings } // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ 
std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -287,7 +289,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +308,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +324,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + 
cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); @@ -364,43 +366,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? 
cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 79d33e7c17d..855ceadf33f 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -116,11 +116,11 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::unique_ptr segmented_gather(lists_column_view const& source_column, lists_column_view const& gather_map_list, out_of_bounds_policy bounds_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::segmented_gather( - source_column, gather_map_list, bounds_policy, cudf::get_default_stream(), mr); + return detail::segmented_gather(source_column, gather_map_list, bounds_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 5d4a20d1cb8..365e9ef8255 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -196,10 +196,11 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, */ std::unique_ptr extract_list_element(lists_column_view const& lists_column, size_type index, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_list_element(lists_column, index, cudf::get_default_stream(), mr); + return detail::extract_list_element(lists_column, index, stream, mr); } /** @@ -209,12 +210,13 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu */ std::unique_ptr extract_list_element(lists_column_view const& lists_column, column_view const& indices, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() == lists_column.size(), "Index column must have as many elements as lists column."); - return detail::extract_list_element(lists_column, indices, cudf::get_default_stream(), mr); + return detail::extract_list_element(lists_column, indices, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index a2af85b5dad..6c00f8b64b4 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -86,10 +86,12 @@ std::unique_ptr reverse(lists_column_view const& input, } // namespace detail -std::unique_ptr reverse(lists_column_view const& input, rmm::mr::device_memory_resource* mr) +std::unique_ptr reverse(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 49054ebb046..0b70773f4b2 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -119,20 +119,21 @@ 
std::unique_ptr stable_sort_lists(lists_column_view const& input, std::unique_ptr sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_lists(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_lists(input, column_order, null_precedence, stream, mr); } std::unique_ptr stable_sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_lists( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_lists(input, column_order, null_precedence, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index aaee5608cc3..f92ba782da7 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -208,19 +208,21 @@ std::unique_ptr sequences(column_view const& starts, std::unique_ptr sequences(column_view const& starts, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, sizes, stream, mr); } std::unique_ptr sequences(column_view const& starts, column_view const& steps, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, steps, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, steps, sizes, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 5687a491363..5647b503cf7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -278,42 +278,44 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr intersect_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::intersect_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::intersect_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr union_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr difference_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::difference_distinct( - lhs, rhs, nulls_equal, 
nans_equal, cudf::get_default_stream(), mr); + return detail::difference_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index ad43fbd5b00..ce972d89150 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -101,10 +101,11 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); + return detail::apply_boolean_mask(input, boolean_mask, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 48d8babb4fa..eb21787b3fa 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -76,10 +76,11 @@ std::unique_ptr distinct(lists_column_view const& input, std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::distinct(input, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..ee29c207cf1 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -13,30 +13,40 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include #include +#include #include -#include +#include #include +#include #include #include #include #include +#include +#include #include #include +#include #include #include #include #include +#include +#include +#include #include -#include #include +#include #include #include #include #include +#include #include #include @@ -45,8 +55,47 @@ namespace cudf { namespace detail { + namespace { +template +struct row_lexicographic_tagged_comparator { + row_lexicographic_tagged_comparator(table_device_view const lhs, + table_device_view const rhs, + device_span const column_order, + device_span const null_precedence) + : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} + { + } + + __device__ bool operator()(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept + { + auto const [l_side, l_indx] = lhs_tagged_index; + auto const [r_side, r_indx] = rhs_tagged_index; + + table_device_view const* ptr_left_dview{l_side == side::LEFT ? &_lhs : &_rhs}; + table_device_view const* ptr_right_dview{r_side == side::LEFT ? 
&_lhs : &_rhs}; + auto const comparator = [&]() { + if constexpr (has_nulls) { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order, _null_precedence}; + } else { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order}; + } + }(); + + return comparator(l_indx, r_indx) == weak_ordering::LESS; + } + + private: + table_device_view const _lhs; + table_device_view const _rhs; + device_span const _null_precedence; + device_span const _column_order; +}; + using detail::side; using index_type = detail::index_type; @@ -187,18 +236,31 @@ index_vector generate_merged_indices(table_view const& left_table, index_vector merged_indices(total_size, stream); + auto const has_nulls = + nullate::DYNAMIC{cudf::has_nulls(left_table) or cudf::has_nulls(right_table)}; + auto lhs_device_view = table_device_view::create(left_table, stream); auto rhs_device_view = table_device_view::create(right_table, stream); auto d_column_order = cudf::detail::make_device_uvector_async( column_order, stream, rmm::mr::get_current_device_resource()); - if (nullable) { + if (has_nulls) { + auto const new_null_precedence = [&]() { + if (null_precedence.size() > 0) { + CUDF_EXPECTS(static_cast(null_precedence.size()) == left_table.num_columns(), + "Null precedence vector size mismatched"); + return null_precedence; + } else { + return std::vector(left_table.num_columns(), null_order::BEFORE); + } + }(); + auto d_null_precedence = cudf::detail::make_device_uvector_async( - null_precedence, stream, rmm::mr::get_current_device_resource()); + new_null_precedence, stream, rmm::mr::get_current_device_resource()); auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data(), d_null_precedence.data()); + *lhs_device_view, *rhs_device_view, d_column_order, d_null_precedence); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -208,7 +270,7 @@ index_vector generate_merged_indices(table_view const& left_table, ineq_op); } else { auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data()); + *lhs_device_view, *rhs_device_view, d_column_order, {}); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -223,6 +285,56 @@ index_vector generate_merged_indices(table_view const& left_table, return merged_indices; } +index_vector generate_merged_indices_nested(table_view const& left_table, + table_view const& right_table, + std::vector const& column_order, + std::vector const& null_precedence, + bool nullable, + rmm::cuda_stream_view stream) +{ + size_type const left_size = left_table.num_rows(); + size_type const right_size = right_table.num_rows(); + size_type const total_size = left_size + right_size; + + index_vector merged_indices(total_size, stream); + + auto const left_indices_col = cudf::detail::lower_bound(right_table, + left_table, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()); + auto const left_indices = left_indices_col->view(); + auto left_indices_mutable = left_indices_col->mutable_view(); + auto const left_indices_begin = left_indices.begin(); + auto const left_indices_end = left_indices.end(); + auto left_indices_mutable_begin = left_indices_mutable.begin(); + + auto const total_counter = thrust::make_counting_iterator(0); + thrust::for_each( + 
rmm::exec_policy_nosync(stream), + total_counter, + total_counter + total_size, + [merged = merged_indices.data(), left = left_indices_begin, left_size, right_size] __device__( + auto const idx) { + // We split threads into two groups, so only one kernel is needed. + // Threads in [0, right_size) will insert right indices in sorted order. + // Threads in [right_size, total_size) will insert left indices in sorted order. + if (idx < right_size) { + // this tells us between which segments of left elements a right element + // would fall + auto const r_bound = thrust::upper_bound(thrust::seq, left, left + left_size, idx); + auto const r_segment = thrust::distance(left, r_bound); + merged[r_segment + idx] = thrust::make_pair(side::RIGHT, idx); + } else { + auto const left_idx = idx - right_size; + merged[left[left_idx] + left_idx] = thrust::make_pair(side::LEFT, left_idx); + } + }); + + return merged_indices; +} + /** * @brief Generate merged column given row-order of merged tables * (ordered according to indices of key_cols) and the 2 columns to merge. @@ -353,6 +465,32 @@ std::unique_ptr column_merger::operator()( return result; } +// specialization for lists +template <> +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + std::vector columns{lcol, rcol}; + auto concatenated_list = cudf::lists::detail::concatenate(columns, stream, mr); + + auto const iter_gather = cudf::detail::make_counting_transform_iterator( + 0, [row_order = row_order_.data(), lsize = lcol.size()] __device__(auto const idx) { + auto const [side, index] = row_order[idx]; + return side == side::LEFT ? index : lsize + index; + }); + + auto result = cudf::detail::gather(table_view{{concatenated_list->view()}}, + iter_gather, + iter_gather + concatenated_list->size(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + // specialization for structs template <> std::unique_ptr column_merger::operator()( @@ -381,7 +519,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? 
detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, @@ -418,9 +556,16 @@ table_ptr_type merge(cudf::table_view const& left_table, // extract merged row order according to indices: // - auto const merged_indices = generate_merged_indices( - index_left_view, index_right_view, column_order, null_precedence, nullable, stream); - + auto const merged_indices = [&]() { + if (cudf::detail::has_nested_columns(left_table) or + cudf::detail::has_nested_columns(right_table)) { + return generate_merged_indices_nested( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } else { + return generate_merged_indices( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } + }(); // create merged table: // auto const n_cols = left_table.num_columns(); @@ -493,6 +638,14 @@ table_ptr_type merge(std::vector const& tables_to_merge, CUDF_EXPECTS(key_cols.size() == column_order.size(), "Mismatched size between key_cols and column_order"); + CUDF_EXPECTS( + std::accumulate(tables_to_merge.cbegin(), + tables_to_merge.cend(), + std::size_t{0}, + [](auto const& running_sum, auto const& tbl) { + return running_sum + static_cast(tbl.num_rows()); + }) <= static_cast(std::numeric_limits::max()), + "Total number of merged rows exceeds row limit"); // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 9e8b75ae3b6..44a13c450ab 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index e74fce62caf..91aa1cac487 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -25,9 +24,10 @@ #include #include #include +#include +#include #include -#include #include #include @@ -68,43 +68,6 @@ std::pair mask_scan(column_view const& input_view namespace { -/** - * @brief Min/Max inclusive scan operator - * - * This operator will accept index values, check them and then - * run the `Op` operation on the individual element objects. - * The returned result is the appropriate index value. - * - * This was specifically created to workaround a thrust issue - * https://github.com/NVIDIA/thrust/issues/1479 - * where invalid values are passed to the operator. 
- */ -template -struct min_max_scan_operator { - column_device_view const col; ///< strings column device view - Element const null_replacement{}; ///< value used when element is null - bool const has_nulls; ///< true if col has null elements - - min_max_scan_operator(column_device_view const& col, bool has_nulls = true) - : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} - { - // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash - if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); - } - - __device__ inline size_type operator()(size_type lhs, size_type rhs) const - { - // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; - // in these cases the return value does not matter since the result is not used - if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; - Element d_lhs = - has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); - Element d_rhs = - has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); - return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs; - } -}; - template struct scan_functor { static std::unique_ptr invoke(column_view const& input_view, @@ -127,11 +90,6 @@ struct scan_functor { } }; -struct null_iterator { - bitmask_type const* mask; - __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); } -}; - template struct scan_functor { static std::unique_ptr invoke(column_view const& input_view, @@ -139,38 +97,7 @@ struct scan_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto d_input = column_device_view::create(input_view, stream); - - // build indices of the scan operation results - rmm::device_uvector result_map(input_view.size(), stream); - thrust::inclusive_scan( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input_view.size()), - result_map.begin(), - min_max_scan_operator{*d_input, input_view.has_nulls()}); - - if (input_view.has_nulls()) { - // fill the null rows with out-of-bounds values so gather records them as null; - // this prevents un-sanitized null entries in the output - auto null_itr = detail::make_counting_transform_iterator(0, null_iterator{mask}); - auto oob_val = thrust::constant_iterator(input_view.size()); - thrust::scatter_if(rmm::exec_policy(stream), - oob_val, - oob_val + input_view.size(), - thrust::counting_iterator(0), - null_itr, - result_map.data()); - } - - // call gather using the indices to build the output column - auto result_table = cudf::detail::gather(cudf::table_view({input_view}), - result_map, - out_of_bounds_policy::NULLIFY, - negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(result_table->release().front()); + return cudf::strings::detail::scan_inclusive(input_view, mask, stream, mr); } }; @@ -181,38 +108,7 @@ struct scan_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Create a gather map containing indices of the prefix min/max elements. - auto gather_map = rmm::device_uvector(input.size(), stream); - auto const binop_generator = - cudf::reduction::detail::comparison_binop_generator::create(input, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - gather_map.begin(), - binop_generator.binop()); - - // Gather the children columns of the input column. 
Must use `get_sliced_child` to properly - // handle input in case it is a sliced view. - auto const input_children = [&] { - auto const it = cudf::detail::make_counting_transform_iterator( - 0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) { - return structs_view.get_sliced_child(child_idx, stream); - }); - return std::vector(it, it + input.num_children()); - }(); - - // Gather the children elements of the prefix min/max struct elements for the output. - auto scanned_children = cudf::detail::gather(table_view{input_children}, - gather_map, - out_of_bounds_policy::DONT_CHECK, - negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); - - // Don't need to set a null mask because that will be handled at the caller. - return make_structs_column( - input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr); + return cudf::structs::detail::scan_inclusive(input, stream, mr); } }; diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 4363bd212fe..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -14,23 +14,14 @@ * limitations under the License. */ -#include - -#include #include #include #include #include #include #include -#include #include -#include - -#include -#include -#include namespace cudf { namespace detail { @@ -38,61 +29,7 @@ namespace detail { namespace { struct contains_column_dispatch { - template - struct contains_fn { - bool __device__ operator()(size_type const idx) const - { - if (needles_have_nulls && needles.is_null_nocheck(idx)) { - // Exit early. The value doesn't matter, and will be masked as a null element. 
- return true; - } - - return haystack.contains(needles.template element(idx)); - } - - Haystack const haystack; - column_device_view const needles; - bool const needles_have_nulls; - }; - - template ())> - std::unique_ptr operator()(column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto result = make_numeric_column(data_type{type_to_id()}, - needles.size(), - copy_bitmask(needles, stream, mr), - needles.null_count(), - stream, - mr); - if (needles.is_empty()) { return result; } - - auto const out_begin = result->mutable_view().template begin(); - if (haystack.is_empty()) { - thrust::uninitialized_fill( - rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false); - return result; - } - - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); - auto const haystack_set_dv = haystack_set.to_device(); - auto const needles_cdv_ptr = column_device_view::create(needles, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(needles.size()), - out_begin, - contains_fn{ - haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); - - result->set_null_count(needles.null_count()); - - return result; - } - - template ())> + template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -105,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; @@ -144,8 +81,6 @@ std::unique_ptr contains(column_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); - return cudf::type_dispatcher( haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0c0ad0ad29e..35b0c0a2690 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -214,25 +214,26 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str // external API -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type( - strings, types, verify_types, cudf::get_default_stream(), mr); + return detail::all_characters_of_type(input, types, verify_types, stream, mr); } -std::unique_ptr filter_characters_of_type(strings_column_view const& strings, +std::unique_ptr filter_characters_of_type(strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr); + input, types_to_remove, replacement, types_to_keep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate.cu 
b/cpp/src/strings/combine/concatenate.cu index ba8acd23467..0a11b6dc460 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -267,11 +267,11 @@ std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separator, narep, separate_nulls, cudf::get_default_stream(), mr); + return detail::concatenate(strings_columns, separator, narep, separate_nulls, stream, mr); } std::unique_ptr concatenate(table_view const& strings_columns, @@ -279,16 +279,12 @@ std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator_narep, string_scalar const& col_narep, separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, - separators, - separator_narep, - col_narep, - separate_nulls, - cudf::get_default_stream(), - mr); + return detail::concatenate( + strings_columns, separators, separator_narep, col_narep, separate_nulls, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index faf1be6a26f..9ab527feaf8 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -180,10 +180,11 @@ std::unique_ptr join_strings(strings_column_view const& input, std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_strings(strings, separator, narep, cudf::get_default_stream(), mr); + return detail::join_strings(strings, separator, narep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index eee59e37478..372b49fb0ee 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -301,16 +301,12 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_scalar const& narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_list_elements(lists_strings_column, - separator, - narep, - separate_nulls, - empty_list_policy, - cudf::get_default_stream(), - mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, empty_list_policy, stream, mr); } std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, @@ -319,6 +315,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_scalar const& string_narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -328,7 +325,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep, separate_nulls, empty_list_policy, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 22534870409..4383f358a33 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -123,28 +123,31 @@ 
std::unique_ptr count_re(strings_column_view const& input, // external APIs -std::unique_ptr contains_re(strings_column_view const& strings, +std::unique_ptr contains_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, prog, cudf::get_default_stream(), mr); + return detail::contains_re(input, prog, stream, mr); } -std::unique_ptr matches_re(strings_column_view const& strings, +std::unique_ptr matches_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, prog, cudf::get_default_stream(), mr); + return detail::matches_re(input, prog, stream, mr); } -std::unique_ptr count_re(strings_column_view const& strings, +std::unique_ptr count_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, prog, cudf::get_default_stream(), mr); + return detail::count_re(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 0d04fc74b0c..8196e1d90fb 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -39,25 +39,25 @@ namespace cudf { namespace strings { namespace detail { // Convert strings column to boolean column -std::unique_ptr to_booleans(strings_column_view const& strings, +std::unique_ptr to_booleans(strings_column_view const& input, string_scalar const& true_string, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::BOOL8}, 0); CUDF_EXPECTS(true_string.is_valid(stream) && true_string.size() > 0, "Parameter true_string must not be empty."); auto d_true = string_view(true_string.data(), true_string.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask auto results = make_numeric_column(data_type{type_id::BOOL8}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); @@ -73,19 +73,20 @@ std::unique_ptr to_booleans(strings_column_view const& strings, result = d_strings.element(idx).compare(d_true) == 0; return result; }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr to_booleans(strings_column_view const& strings, +std::unique_ptr to_booleans(strings_column_view const& input, string_scalar const& true_string, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_booleans(strings, true_string, cudf::get_default_stream(), mr); + return detail::to_booleans(input, true_string, stream, mr); } namespace detail { @@ -156,10 +157,11 @@ std::unique_ptr from_booleans(column_view const& booleans, std::unique_ptr from_booleans(column_view 
const& booleans, string_scalar const& true_string, string_scalar const& false_string, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_booleans(booleans, true_string, false_string, cudf::get_default_stream(), mr); + return detail::from_booleans(booleans, true_string, false_string, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 8a953d778ed..d2609441d72 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -710,18 +710,20 @@ std::unique_ptr is_timestamp(strings_column_view const& input, std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(input, timestamp_type, format, cudf::get_default_stream(), mr); + return detail::to_timestamps(input, timestamp_type, format, stream, mr); } std::unique_ptr is_timestamp(strings_column_view const& input, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(input, format, cudf::get_default_stream(), mr); + return detail::is_timestamp(input, format, stream, mr); } namespace detail { @@ -1168,10 +1170,11 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr from_timestamps(column_view const& timestamps, std::string_view format, strings_column_view const& names, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, names, cudf::get_default_stream(), mr); + return detail::from_timestamps(timestamps, format, names, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 6ab70825a6b..e781581b378 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -690,30 +690,30 @@ std::unique_ptr from_durations(column_view const& durations, durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr); } -std::unique_ptr to_durations(strings_column_view const& strings, +std::unique_ptr to_durations(strings_column_view const& input, data_type duration_type, std::string_view format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_duration_column(duration_type, 0); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; auto results = make_duration_column(duration_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -721,19 +721,21 @@ std::unique_ptr 
to_durations(strings_column_view const& strings, std::unique_ptr from_durations(column_view const& durations, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_durations(durations, format, cudf::get_default_stream(), mr); + return detail::from_durations(durations, format, stream, mr); } -std::unique_ptr to_durations(strings_column_view const& strings, +std::unique_ptr to_durations(strings_column_view const& input, data_type duration_type, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_durations(strings, duration_type, format, cudf::get_default_stream(), mr); + return detail::to_durations(input, duration_type, format, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 51aab9faeba..2c59f6dcd29 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -184,12 +184,13 @@ std::unique_ptr to_fixed_point(strings_column_view const& input, } // namespace detail // external API -std::unique_ptr to_fixed_point(strings_column_view const& strings, +std::unique_ptr to_fixed_point(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_fixed_point(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_fixed_point(input, output_type, stream, mr); } namespace detail { @@ -277,10 +278,11 @@ std::unique_ptr from_fixed_point(column_view const& input, // external API std::unique_ptr from_fixed_point(column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_fixed_point(input, cudf::get_default_stream(), mr); + return detail::from_fixed_point(input, stream, mr); } namespace detail { @@ -341,10 +343,11 @@ std::unique_ptr is_fixed_point(strings_column_view const& input, std::unique_ptr is_fixed_point(strings_column_view const& input, data_type decimal_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_fixed_point(input, decimal_type, cudf::get_default_stream(), mr); + return detail::is_fixed_point(input, decimal_type, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 32167589ab4..81d686d690c 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -91,26 +91,26 @@ struct dispatch_to_floats_fn { } // namespace // This will convert a strings column into any float column type. 
-std::unique_ptr to_floats(strings_column_view const& strings, +std::unique_ptr to_floats(strings_column_view const& input, data_type output_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask auto results = make_numeric_column(output_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -118,12 +118,13 @@ std::unique_ptr to_floats(strings_column_view const& strings, // external API -std::unique_ptr to_floats(strings_column_view const& strings, +std::unique_ptr to_floats(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_floats(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_floats(input, output_type, stream, mr); } namespace detail { @@ -436,48 +437,51 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API -std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) +std::unique_ptr from_floats(column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_floats(floats, cudf::get_default_stream(), mr); + return detail::from_floats(floats, stream, mr); } namespace detail { -std::unique_ptr is_float(strings_column_view const& strings, +std::unique_ptr is_float(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; // create output column auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), + thrust::make_counting_iterator(input.size()), d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; return is_float(d_column.element(idx)); }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr is_float(strings_column_view const& strings, +std::unique_ptr is_float(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) 
{ CUDF_FUNC_RANGE(); - return detail::is_float(strings, cudf::get_default_stream(), mr); + return detail::is_float(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index bed682aba71..8f656b149a5 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -93,7 +93,8 @@ struct hex_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_hex_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -105,22 +106,14 @@ struct dispatch_hex_to_integers_fn { d_results, hex_to_integer_fn{strings_column}); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, void> operator()(Args&&...) const + std::enable_if_t(), void> operator()(Args&&...) const { - CUDF_FAIL("Output for hex_to_integers must be an integral type."); + CUDF_FAIL("Output for hex_to_integers must be an integer type."); } }; -template <> -void dispatch_hex_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for hex_to_integers must not be a boolean type."); -} - /** * @brief Functor to convert integers to hexadecimal strings * @@ -179,7 +172,8 @@ struct integer_to_hex_fn { }; struct dispatch_integers_to_hex_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -195,11 +189,12 @@ struct dispatch_integers_to_hex_fn { input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, std::unique_ptr> operator()(Args...) const + std::enable_if_t(), std::unique_ptr> operator()( + Args...) 
const { - CUDF_FAIL("integers_to_hex only supports integral type columns"); + CUDF_FAIL("integers_to_hex only supports integer type columns"); } }; @@ -280,24 +275,27 @@ std::unique_ptr integers_to_hex(column_view const& input, // external API std::unique_ptr hex_to_integers(strings_column_view const& strings, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hex_to_integers(strings, output_type, cudf::get_default_stream(), mr); + return detail::hex_to_integers(strings, output_type, stream, mr); } std::unique_ptr is_hex(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_hex(strings, cudf::get_default_stream(), mr); + return detail::is_hex(strings, stream, mr); } std::unique_ptr integers_to_hex(column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_hex(input, cudf::get_default_stream(), mr); + return detail::integers_to_hex(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5597d2831c0..4839e83d5dd 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -111,21 +111,21 @@ inline __device__ bool is_integer(string_view const& d_str) * @brief The dispatch functions for checking if strings are valid integers. */ struct dispatch_is_integer_fn { - template >* = nullptr> - std::unique_ptr operator()(strings_column_view const& strings, + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_column = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - if (strings.has_nulls()) { + if (input.has_nulls()) { thrust::transform(rmm::exec_policy(stream), d_column->pair_begin(), d_column->pair_end(), @@ -140,12 +140,12 @@ struct dispatch_is_integer_fn { } // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const @@ -156,20 +156,20 @@ struct dispatch_is_integer_fn { } // namespace -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_column = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + 
input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - if (strings.has_nulls()) { + if (input.has_nulls()) { thrust::transform( rmm::exec_policy(stream), d_column->pair_begin(), @@ -185,36 +185,38 @@ std::unique_ptr is_integer(strings_column_view const& strings, } // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, data_type int_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } - return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); + if (input.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } + return type_dispatcher(int_type, dispatch_is_integer_fn{}, input, stream, mr); } } // namespace detail // external APIs -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, cudf::get_default_stream(), mr); + return detail::is_integer(input, stream, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, data_type int_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, int_type, cudf::get_default_stream(), mr); + return detail::is_integer(input, int_type, stream, mr); } namespace detail { @@ -243,7 +245,8 @@ struct string_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -254,47 +257,39 @@ struct dispatch_to_integers_fn { output_column.data(), string_to_integer_fn{strings_column}); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { - CUDF_FAIL("Output for to_integers must be an integral type."); + CUDF_FAIL("Output for to_integers must be an integer type."); } }; -template <> -void dispatch_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for to_integers must not be a boolean type."); -} - } // namespace // This will convert a strings column into any integer column type. 
-std::unique_ptr to_integers(strings_column_view const& strings, +std::unique_ptr to_integers(strings_column_view const& input, data_type output_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); // Create integer output column copying the strings null-mask auto results = make_numeric_column(output_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); // Fill output column with integers - auto const strings_dev_view = column_device_view::create(strings.parent(), stream); + auto const strings_dev_view = column_device_view::create(input.parent(), stream); auto results_view = results->mutable_view(); type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream); // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -302,12 +297,13 @@ std::unique_ptr to_integers(strings_column_view const& strings, } // namespace detail // external API -std::unique_ptr to_integers(strings_column_view const& strings, +std::unique_ptr to_integers(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_integers(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_integers(input, output_type, stream, mr); } namespace detail { @@ -351,7 +347,8 @@ struct from_integers_fn { * The template function declaration ensures only integer types are used. */ struct dispatch_from_integers_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& integers, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -373,23 +370,15 @@ struct dispatch_from_integers_fn { std::move(null_mask)); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Values for from_integers function must be an integral type."); + CUDF_FAIL("Values for from_integers function must be an integer type."); } }; - -template <> -std::unique_ptr dispatch_from_integers_fn::operator()( - column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const -{ - CUDF_FAIL("Input for from_integers must not be a boolean type."); -} - } // namespace // This will convert all integer column types into a strings column. 
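The hunks above route the public cudf::strings::to_integers and cudf::strings::is_integer overloads through a caller-supplied rmm::cuda_stream_view instead of hard-coding cudf::get_default_stream(). A minimal caller-side sketch of the new signatures follows; the parse_ids wrapper, the INT32 choice, and the use of the current device resource are illustrative assumptions and not part of this patch.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Sketch: parse a strings column as INT32 on a caller-provided stream.
std::unique_ptr<cudf::column> parse_ids(cudf::strings_column_view const& input,
                                        rmm::cuda_stream_view stream)
{
  auto const int32 = cudf::data_type{cudf::type_id::INT32};
  auto* mr         = rmm::mr::get_current_device_resource();

  // Rows that do not parse as INT32 come back as false here; both calls are
  // enqueued on the same user-supplied stream rather than the default stream.
  [[maybe_unused]] auto valid = cudf::strings::is_integer(input, int32, stream, mr);

  return cudf::strings::to_integers(input, int32, stream, mr);
}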
@@ -407,10 +396,11 @@ std::unique_ptr from_integers(column_view const& integers, // external API std::unique_ptr from_integers(column_view const& integers, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_integers(integers, cudf::get_default_stream(), mr); + return detail::from_integers(integers, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index adb72cb0263..07e4b3e5b17 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -72,19 +72,19 @@ struct ipv4_to_integers_fn { } // namespace // Convert strings column of IPv4 addresses to integers column -std::unique_ptr ipv4_to_integers(strings_column_view const& strings, +std::unique_ptr ipv4_to_integers(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); // create output column copying the strings' null-mask auto results = make_numeric_column(data_type{type_id::INT64}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); @@ -95,18 +95,19 @@ std::unique_ptr ipv4_to_integers(strings_column_view const& strings, d_results, ipv4_to_integers_fn{*strings_column}); // done - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr ipv4_to_integers(strings_column_view const& strings, +std::unique_ptr ipv4_to_integers(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ipv4_to_integers(strings, cudf::get_default_stream(), mr); + return detail::ipv4_to_integers(input, stream, mr); } namespace detail { @@ -173,23 +174,23 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, cudf::detail::copy_bitmask(integers, stream, mr)); } -std::unique_ptr is_ipv4(strings_column_view const& strings, +std::unique_ptr is_ipv4(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; // create output column auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), + thrust::make_counting_iterator(input.size()), d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; @@ -214,7 +215,7 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, return ip_vals[0] >= 0 && 
ip_vals[1] >= 0 && ip_vals[2] >= 0 && ip_vals[3] >= 0; }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -223,17 +224,19 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, // external API std::unique_ptr integers_to_ipv4(column_view const& integers, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_ipv4(integers, cudf::get_default_stream(), mr); + return detail::integers_to_ipv4(integers, stream, mr); } -std::unique_ptr is_ipv4(strings_column_view const& strings, +std::unique_ptr is_ipv4(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_ipv4(strings, cudf::get_default_stream(), mr); + return detail::is_ipv4(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index 3aef37914fd..f9f2b91eb12 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -233,10 +233,11 @@ std::unique_ptr format_list_column(lists_column_view const& input, std::unique_ptr format_list_column(lists_column_view const& input, string_scalar const& na_rep, strings_column_view const& separators, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::format_list_column(input, na_rep, separators, cudf::get_default_stream(), mr); + return detail::format_list_column(input, na_rep, separators, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 9efa148cfd2..511acc38d75 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -148,11 +148,12 @@ std::unique_ptr url_encode(strings_column_view const& input, } // namespace detail // external API -std::unique_ptr url_encode(strings_column_view const& strings, +std::unique_ptr url_encode(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_encode(strings, cudf::get_default_stream(), mr); + return detail::url_encode(input, stream, mr); } namespace detail { @@ -211,7 +212,8 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. 
- for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) { out_counts[row_idx] = 0; continue; @@ -295,7 +297,8 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp - for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) continue; auto const in_string = in_strings.element(row_idx); @@ -428,11 +431,12 @@ std::unique_ptr url_decode(strings_column_view const& strings, // external API -std::unique_ptr url_decode(strings_column_view const& strings, +std::unique_ptr url_decode(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_decode(strings, cudf::get_default_stream(), mr); + return detail::url_decode(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 532053e750e..8edcd167e5c 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -131,12 +131,13 @@ std::unique_ptr
<table> extract(strings_column_view const& input, // external API -std::unique_ptr<table>
extract(strings_column_view const& strings, +std::unique_ptr<table>
extract(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, prog, cudf::get_default_stream(), mr); + return detail::extract(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 8a2f8f0cbfc..0c0d4ae4fbf 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -164,12 +164,13 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // external API -std::unique_ptr extract_all_record(strings_column_view const& strings, +std::unique_ptr extract_all_record(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr); + return detail::extract_all_record(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3e38b5fa775..9f95fedfe0b 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -154,15 +154,16 @@ std::unique_ptr filter_characters( * @copydoc cudf::strings::filter_characters */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr); + input, characters_to_filter, keep_characters, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 5b91f295efb..93e00592ef2 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -185,19 +185,21 @@ std::unique_ptr like(strings_column_view const& input, std::unique_ptr like(strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, pattern, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, pattern, escape_character, stream, mr); } std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, patterns, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, patterns, escape_character, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 396e1e6a2ac..847a64f5602 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -67,7 +67,7 @@ std::unique_ptr repeat_string(string_scalar const& input, return in_ptr[idx % str_size]; }); - return std::make_unique(std::move(buff)); + return std::make_unique(std::move(buff), true, stream, mr); } namespace { @@ -260,26 +260,29 @@ std::unique_ptr repeat_strings(strings_column_view const& input, std::unique_ptr repeat_string(string_scalar const& input, size_type repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_string(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_string(input, repeat_times, stream, mr); } std::unique_ptr repeat_strings(strings_column_view const& input, size_type repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, stream, mr); } std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 31e06aac72b..74f38cbcc20 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -148,10 +148,11 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, std::unique_ptr replace_with_backrefs(strings_column_view const& strings, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, prog, replacement, cudf::get_default_stream(), mr); + return detail::replace_with_backrefs(strings, prog, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 92ace4e7bc7..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** @@ -490,10 +490,11 @@ std::unique_ptr replace(strings_column_view const& input, std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, cudf::get_default_stream(), mr); + return detail::replace(strings, targets, repls, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu 
index 867b443c036..3375cb7a789 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -206,10 +206,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, replacements, flags, cudf::get_default_stream(), mr); + return detail::replace_re(strings, patterns, replacements, flags, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..a6a14f27dec 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) @@ -751,21 +751,23 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, target, repl, maxrepl, cudf::get_default_stream(), mr); + return detail::replace(strings, target, repl, maxrepl, stream, mr); } std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, cudf::get_default_stream(), mr); + return detail::replace_slice(strings, repl, start, stop, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 81ddb937be5..502d5f1a52e 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -134,11 +134,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, regex_program const& prog, string_scalar const& replacement, std::optional max_replace_count, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re( - strings, prog, replacement, max_replace_count, cudf::get_default_stream(), mr); + return detail::replace_re(strings, prog, replacement, max_replace_count, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 090705ac25d..2855bdbb827 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -79,10 +79,11 @@ std::unique_ptr reverse(strings_column_view const& input, } // namespace detail std::unique_ptr reverse(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu new file mode 100644 index 00000000000..0cf492fa295 --- /dev/null +++ b/cpp/src/strings/scan/scan_inclusive.cu @@ -0,0 +1,132 @@ +/* + * Copyright 
(c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Min/Max inclusive scan operator + * + * This operator will accept index values, check them and then + * run the `Op` operation on the individual element objects. + * The returned result is the appropriate index value. + * + * This was specifically created to workaround a thrust issue + * https://github.com/NVIDIA/thrust/issues/1479 + * where invalid values are passed to the operator. + */ +template +struct min_max_scan_operator { + column_device_view const col; ///< strings column device view + Element const null_replacement{}; ///< value used when element is null + bool const has_nulls; ///< true if col has null elements + + min_max_scan_operator(column_device_view const& col, bool has_nulls = true) + : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} + { + // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash + if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); + } + + __device__ inline size_type operator()(size_type lhs, size_type rhs) const + { + // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; + // in these cases the return value does not matter since the result is not used + if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; + Element d_lhs = + has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); + Element d_rhs = + has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); + return Op{}(d_lhs, d_rhs) == d_lhs ? 
lhs : rhs; + } +}; + +struct null_iterator { + bitmask_type const* mask; + __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); } +}; + +} // namespace + +template +std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_input = column_device_view::create(input, stream); + + // build indices of the scan operation results + rmm::device_uvector result_map(input.size(), stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + result_map.begin(), + min_max_scan_operator{*d_input, input.has_nulls()}); + + if (input.has_nulls()) { + // fill the null rows with out-of-bounds values so gather records them as null; + // this prevents un-sanitized null entries in the output + auto null_itr = cudf::detail::make_counting_transform_iterator(0, null_iterator{mask}); + auto oob_val = thrust::constant_iterator(input.size()); + thrust::scatter_if(rmm::exec_policy(stream), + oob_val, + oob_val + input.size(), + thrust::counting_iterator(0), + null_itr, + result_map.data()); + } + + // call gather using the indices to build the output column + auto result_table = cudf::detail::gather(cudf::table_view({input}), + result_map, + cudf::out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(result_table->release().front()); +} + +template std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 0c7d119ea38..16e6402cfef 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -239,20 +239,22 @@ std::unique_ptr
<table> rpartition(strings_column_view const& strings, // external APIs -std::unique_ptr<table>
partition(strings_column_view const& strings, +std::unique_ptr<table>
partition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::partition(input, delimiter, stream, mr); } -std::unique_ptr
<table> rpartition(strings_column_view const& strings, +std::unique_ptr<table>
rpartition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::rpartition(input, delimiter, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3be5937297f..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -340,37 +340,41 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input, std::unique_ptr
split_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_re(input, prog, maxsplit, stream, mr); } std::unique_ptr split_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_record_re(input, prog, maxsplit, stream, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_re(input, prog, maxsplit, stream, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_record_re(input, prog, maxsplit, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e7b637c52f3..0ca5e103d3d 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -124,12 +124,13 @@ std::unique_ptr translate(strings_column_view const& strings, // external APIs -std::unique_ptr translate(strings_column_view const& strings, +std::unique_ptr translate(strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table, cudf::get_default_stream(), mr); + return detail::translate(input, chars_table, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/src/structs/scan/scan_inclusive.cu b/cpp/src/structs/scan/scan_inclusive.cu new file mode 100644 index 00000000000..823e4472960 --- /dev/null +++ b/cpp/src/structs/scan/scan_inclusive.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace structs { +namespace detail { +namespace { + +} // namespace + +template +std::unique_ptr scan_inclusive(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Create a gather map containing indices of the prefix min/max elements. + auto gather_map = rmm::device_uvector(input.size(), stream); + auto const binop_generator = + cudf::reduction::detail::comparison_binop_generator::create(input, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + gather_map.begin(), + binop_generator.binop()); + + // Gather the children columns of the input column. Must use `get_sliced_child` to properly + // handle input in case it is a sliced view. + auto const input_children = [&] { + auto const it = cudf::detail::make_counting_transform_iterator( + 0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) { + return structs_view.get_sliced_child(child_idx, stream); + }); + return std::vector(it, it + input.num_children()); + }(); + + // Gather the children elements of the prefix min/max struct elements for the output. + auto scanned_children = cudf::detail::gather(table_view{input_children}, + gather_map, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // Don't need to set a null mask because that will be handled at the caller. 
+ return make_structs_column( + input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr); +} + +template std::unique_ptr scan_inclusive(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr scan_inclusive(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace structs +} // namespace cudf diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu new file mode 100644 index 00000000000..5be35119003 --- /dev/null +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +constexpr int block_size = 512; + +/** + * @brief Produces offsets to unpairable locations in the given chars array + * + * Launched as a thread per byte of the chars array. + * The output is non-zero offsets to locations of unpairable substrings. + * An unpairable substring does not exist in the given map and so will + * never be paired. Fortunately, this can be used as an artificial + * boundary providing increased parallelism in the BPE kernel. + * + * @tparam MapRefType The type of the map finder object + */ +template +struct bpe_unpairable_offsets_fn { + cudf::device_span d_chars; + cudf::size_type offset; + MapRefType const d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; } + + auto const itr = d_chars.data() + idx; + auto const end = d_chars.end(); + auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); + auto const next = itr + lhs.size_bytes(); + auto output = 0; + if (next < end) { + auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); + // see if both halves exist anywhere in the table, if not these are unpairable + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary + } + } + return output; + } +}; + +/** + * @brief Performs byte-pair-encoding + * + * Computes the locations where the separator will be inserted in `d_spaces_data`. + * This is launched as a string per block. + * + * The process first initializes all characters to 1 per position in `d_spaces_data`. + * All pairs are realized and their ranks stored in `d_ranks_data`. + * + * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location + * is set to 0 resulting in new potential pairs. 
The process repeats accounting for + * the rank of the newly formed pairs. + * + * Once there are no more rankable pairs, the process finishes and the `d_spaces_data` + * values identify the location to insert the separator. + * + * @tparam MapRefType The type of the map finder object + * @param d_strings Input data + * @param d_map For looking up individual string candidates + * @param d_spaces_data Output the location where separator will be inserted + * @param d_ranks_data Working memory to hold pair ranks + * @param d_rerank_data Working memory to hold locations where reranking is required + */ +template +__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + auto const d_str = d_strings.element(str_idx); + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const d_ranks = d_ranks_data + offset; + auto const end_ranks = d_ranks + d_str.size_bytes(); + auto const d_rerank = d_rerank_data + offset; + auto const end_rerank = d_rerank + d_str.size_bytes(); + + auto constexpr max_rank = cuda::std::numeric_limits::max(); + + __shared__ cudf::size_type block_min_rank; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + auto const num_valid = block_size < d_str.size_bytes() ? 
block_size : d_str.size_bytes(); + + // init all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; + } + // init all ranks to max + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + *itr = max_rank; + } + // init all spaces to 1 as appropriate + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + auto const index = thrust::distance(d_spaces, itr); + *itr = static_cast(cudf::strings::detail::is_begin_utf8_char(d_str.data()[index])); + } + __syncthreads(); + + // for finding the next half of a pair + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find(thrust::seq, begin + 1, end, 1); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + // for locating adjacent pairs after merging a pair + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; + } + return ptr; + }; + + auto min_rank = max_rank; + + // store all the initial ranks for each pair + // every character but the first one will have a initial rank + // + // Example: + // string: abcdefghij + // spaces: 1111111111 + // ranks: *948516327 + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + if (*itr == 0) { continue; } // skips any UTF-8 continuation bytes + // resolve pair and lookup its rank + auto const lhs = next_substr(itr); // retrieve lhs of the pair + auto const next_itr = itr + lhs.size_bytes(); + if (next_itr < end_spaces) { + auto const rhs = next_substr(next_itr); // retrieve rhs of the pair + if (!rhs.empty()) { + auto rank = max_rank; + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup pair in merges table; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; + d_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + if (rank < min_rank) { min_rank = rank; } + } + } + } + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + + // loop through the ranks processing the current minimum until there are no more + while (block_min_rank < max_rank) { + // search the d_ranks for matches to block_min_rank + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + if (*itr == block_min_rank) { + auto ptr = itr - 1; // check for adjacent min-rank (edge-case) + while (ptr > d_ranks && *ptr == max_rank) { + --ptr; + } + // set the output value to 0 at this position (erases separator, merges pair) + // using example string above, the min-rank is 1 at position 5 + // string: abcdefghij + // spaces: 1111101111 (set position 5 to 0) + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_ranks, itr)] = 0; } + } + } + __syncthreads(); + + // identify all the re-rank locations (logic above invalidated adjacent pairs) + // using example string above, the adjacent pairs have to be re-ranked + // string: abcdefghij + // spaces: 1111101111 (pair 'e,f' is now merged) + // rerank: 0000101000 ('ef' and 'fg' need re-ranking as 'd,ef' and 'ef,g' + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_ranks, itr); + if (*itr == block_min_rank && d_spaces[index] == 0) { + // find previous pair 
mid-point + auto ptr = find_prev(d_spaces + index - 1); + if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + // find next pair mid-point + ptr = thrust::find(thrust::seq, d_spaces + index + 1, end_spaces, 1); + if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + *itr = max_rank; // reset this rank + } + } + __syncthreads(); + + // compute the ranks for the newly created pairs + min_rank = max_rank; // and record the new minimum along the way + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + auto const index = thrust::distance(d_rerank, itr); + auto rank = d_ranks[index]; + if (*itr) { + *itr = 0; // reset re-rank + // build lhs of pair + auto const ptr = find_prev(d_spaces + index - 1); + auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); + auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); + auto const rhs = next_substr(d_spaces + index); // retrieve rhs of pair + rank = max_rank; + if (!rhs.empty()) { + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup rank for this pair; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match + } + d_ranks[index] = rank; // store new rank + } + if (rank < min_rank) { min_rank = rank; } + } + + // re-compute the minimum rank across the block (since new pairs are created above) + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + } // if no min ranks are found we are done, otherwise start again +} + +/** + * @brief Computes the output size of each strings row + * + * This launches as a string per block. + * The non-zero values in `d_spaces_data` for each string is added to + * the current string size to produce the total output bytes. + * + * @param d_strings Input data + * @param d_spaces_data Output the location where separator will be inserted + * @param d_sizes Output sizes of each row + */ +__global__ void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + if (d_strings.is_null(str_idx)) { + d_sizes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_sizes[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const num_valid = block_size < d_str.size_bytes() ? 
block_size : d_str.size_bytes(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + // reset the first position -- no separator to be added here + if (lane_idx == 0) { *d_spaces = 0; } + + // compute the output size for this string by counting the resulting separator positions + auto bytes = 0; + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + bytes += (*itr > 0); + } + auto const total_bytes = block_reduce(temp_storage).Sum(bytes, num_valid); + if (lane_idx == 0) { d_sizes[str_idx] = total_bytes + d_str.size_bytes(); } +} + +} // namespace + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merge_pairs, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty() || input.chars_size() == 0) { + return cudf::make_empty_column(cudf::type_id::STRING); + } + + CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); + auto const d_separator = separator.value(stream); + CUDF_EXPECTS(d_separator.size_bytes() == 1, "for now, separator must be a single-byte character"); + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::make_numeric_column( + offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + rmm::device_uvector d_spaces(chars_size, stream); // identifies non-merged pairs + // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions + rmm::device_uvector d_working(chars_size, stream); + + auto const chars_begin = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + + { + // this kernel locates unpairable sections of strings to create artificial string row + // boundaries; the boundary values are recorded as offsets in d_up_offsets + auto const d_up_offsets = d_working.data(); // store unpairable offsets here + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // lookup table + auto const d_chars_span = cudf::device_span(d_input_chars, chars_size); + auto up_fn = bpe_unpairable_offsets_fn{d_chars_span, first_offset, mp_map}; + thrust::transform(rmm::exec_policy_nosync(stream), chars_begin, chars_end, d_up_offsets, up_fn); + auto const up_end = // remove all but the unpairable offsets + thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0); + auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables + + // new string boundaries created by combining unpairable offsets with the existing offsets + auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); + thrust::merge(rmm::exec_policy_nosync(stream), + input.offsets_begin(), + input.offsets_end(), + d_up_offsets, + up_end, + tmp_offsets.begin()); + // remove any adjacent duplicate 
offsets (i.e. empty or null rows) + auto const offsets_end = + thrust::unique(rmm::exec_policy_nosync(stream), tmp_offsets.begin(), tmp_offsets.end()); + auto const offsets_total = + static_cast(thrust::distance(tmp_offsets.begin(), offsets_end)); + tmp_offsets.resize(offsets_total, stream); + + // temp column created with the merged offsets and the original chars data + auto const col_offsets = + cudf::column_view(cudf::device_span(tmp_offsets)); + auto const tmp_size = offsets_total - 1; + auto const tmp_input = cudf::column_view( + input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); + + // launch the byte-pair-encoding kernel on the temp column + rmm::device_uvector d_rerank(chars_size, stream); // more working memory; + auto const d_ranks = d_working.data(); // store pair ranks here + auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); + bpe_parallel_fn<<>>( + *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data()); + } + + // compute the output sizes and store them in the d_offsets vector + bpe_finalize<<>>( + *d_strings, d_spaces.data(), d_offsets); + + // convert sizes to offsets in-place + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), + "Size of output exceeds the column size limit", + std::overflow_error); + + // build the output: inserting separators to the input character data + auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); + auto d_chars = chars->mutable_view().data(); + + auto const d_inserts = d_working.data(); // stores the insert positions + auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { + return d_spaces[idx] > 0; // separator to be inserted here + }; + auto const copy_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero); + + // this will insert the single-byte separator into positions specified in d_inserts + auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); + thrust::merge_by_key(rmm::exec_policy_nosync(stream), + d_inserts, // where to insert separator byte + copy_end, // + chars_begin, // all indices + chars_end, // + sep_char, // byte to insert + d_input_chars, // original data + thrust::make_discard_iterator(), + d_chars); // result + + return cudf::make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_table, + cudf::string_scalar const& separator, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); +} + +} // namespace nvtext diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh new file mode 100644 index 00000000000..2a170317909 --- /dev/null +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace nvtext { +namespace detail { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; +using merge_pair_type = thrust::pair; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * The hash of each string is combined for the returned result. + */ +struct bpe_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + index *= 2; + auto const lhs = d_strings.element(index); + auto const rhs = d_strings.element(index + 1); + return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs)); + } + // used by find + __device__ hash_value_type operator()(merge_pair_type const& mp) const + { + return cudf::hashing::detail::hash_combine(hasher(mp.first), hasher(mp.second)); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * All rows from the input merge-pairs are unique. + */ +struct bpe_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are unique + } + // used by find + __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept + { + lhs *= 2; + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(lhs + 1); + return (left == rhs.first) && (right == rhs.second); + } +}; + +using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; + +using merge_pairs_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + bpe_equal, + bpe_probe_scheme, + hash_table_allocator_type>; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * Each component of the merge-pairs (left and right) are stored individually in the map. 
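Both hashers rely on the same trick: the map never stores strings, only row indices, and the hasher/equal functors know how to evaluate either an index (at insert time) or the value it denotes (at probe time), so both sides land in the same slots. A small host-side sketch of that heterogeneous insert/find pattern with a hand-rolled linear-probing table; hash_combine below follows the familiar boost-style mix (illustrative, not necessarily the exact cudf implementation), and every name is an assumption rather than the real cuco API:

#include <cstddef>
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
{
  return lhs ^ (rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2));
}

int main()
{
  // adjacent rows hold the two halves of each merge pair: (h,e) then (he,l)
  std::vector<std::string> rows{"h", "e", "he", "l"};
  auto hash_pair = [&rows](std::string const& a, std::string const& b) {
    return hash_combine(std::hash<std::string>{}(a), std::hash<std::string>{}(b));
  };

  // insert by index: the stored value is the pair's rank, the key is never materialized
  std::vector<std::optional<int>> slots(8);
  for (int idx = 0; idx * 2 + 1 < static_cast<int>(rows.size()); ++idx) {
    auto h = hash_pair(rows[2 * idx], rows[2 * idx + 1]) % slots.size();
    while (slots[h]) { h = (h + 1) % slots.size(); }  // linear probing
    slots[h] = idx;
  }

  // find with a pair of strings: same hash function, equality re-reads the rows
  auto find_rank = [&](std::string const& a, std::string const& b) {
    auto h = hash_pair(a, b) % slots.size();
    while (slots[h]) {
      auto const idx = *slots[h];
      if (rows[2 * idx] == a && rows[2 * idx + 1] == b) { return idx; }
      h = (h + 1) % slots.size();
    }
    return -1;
  };

  std::cout << find_rank("he", "l") << '\n';  // prints 1, the rank of ("he","l")
  return 0;
}

bpe_hasher/bpe_equal apply this with the pair stored at rows 2*i and 2*i+1, while mp_hasher/mp_equal below do the same with a single row per key.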
+ */ +struct mp_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + return hasher(d_str); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& d_str) const + { + return hasher(d_str); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct mp_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(rhs); + return left == right; + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + auto const left = d_strings.element(lhs); + return left == rhs; + } +}; + +using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>; + +using mp_table_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + mp_equal, + mp_probe_scheme, + hash_table_allocator_type>; + +} // namespace detail + +// since column_device_view::create() returns is a little more than +// std::unique_ptr this helper simplifies the return type for us +using col_device_view = std::invoke_result_t; + +struct bpe_merge_pairs::bpe_merge_pairs_impl { + std::unique_ptr const merge_pairs; + col_device_view const d_merge_pairs; + std::unique_ptr merge_pairs_map; // for BPE + std::unique_ptr mp_table_map; // for locating unpairables + + bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, + col_device_view&& d_merge_pairs, + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map); + + auto const get_merge_pairs() const { return *d_merge_pairs; } + auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } + auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); } +}; + +} // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/bpe/load_merge_pairs.cu similarity index 67% rename from cpp/src/text/subword/load_merges_file.cu rename to cpp/src/text/bpe/load_merge_pairs.cu index db6ad2e2dd2..80073df5804 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -14,22 +14,21 @@ * limitations under the License. 
*/ -#include +#include -#include +#include #include #include #include #include +#include #include #include #include #include -#include - #include #include #include @@ -88,32 +87,51 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - // Ensure capacity is at least (size/0.7) as documented here: - // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; + static_cast(input.size()), cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, bpe_equal{input}, - probe_scheme{bpe_hasher{input}}, + bpe_probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); + merge_pairs_map->insert_async(iter, iter + (input.size() / 2), stream.value()); return merge_pairs_map; } +std::unique_ptr initialize_mp_table_map( + cudf::column_device_view const& input, rmm::cuda_stream_view stream) +{ + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); + + mp_table_map->insert_async(iter, iter + input.size(), stream.value()); + + return mp_table_map; +} + std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto d_input = cudf::column_device_view::create(input->view(), stream); - auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto mp_table_map = initialize_mp_table_map(*d_input, stream); return std::make_unique( - std::move(input), std::move(d_input), std::move(merge_pairs)); + std::move(input), std::move(d_input), std::move(merge_pairs), std::move(mp_table_map)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -121,8 +139,9 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return create_bpe_merge_pairs_impl(std::make_unique(input.parent(), stream, mr), - stream); + auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); + auto content = pairs->release(); + return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } } // namespace @@ -135,6 +154,15 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return std::make_unique(std::move(input_column), stream, mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty"); + CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls"); + return std::make_unique(merge_pairs, stream, mr); +} + } // namespace detail 
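The split step is what makes the index-keyed maps above work: each merges row such as "he l" is split once on the space by cudf::strings::split_record, so the resulting child column holds lhs/rhs in adjacent rows and row-pair i carries rank i. A small host-side sketch of that preprocessing and of the rank lookup it enables, using std::unordered_map keyed on the textual pair instead of the cuco table (names are illustrative only):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
  // raw merge-pair rows as they appear in a merges file, highest priority first
  std::vector<std::string> const merges{"h e", "he l", "hel lo"};

  // split each row once on the first space -> flattened lhs/rhs rows
  // (assumes every row contains exactly one separating space)
  std::vector<std::string> components;
  for (auto const& row : merges) {
    auto const pos = row.find(' ');
    components.push_back(row.substr(0, pos));
    components.push_back(row.substr(pos + 1));
  }

  // pair i -> rank i (the device code keeps only the index; here we key on the text)
  std::unordered_map<std::string, int> ranks;
  for (std::size_t i = 0; i * 2 + 1 < components.size(); ++i) {
    ranks.emplace(components[2 * i] + ' ' + components[2 * i + 1], static_cast<int>(i));
  }

  auto const it = ranks.find("he l");
  std::cout << (it != ranks.end() ? it->second : -1) << '\n';  // prints 1
  return 0;
}

A lower rank means a higher merge priority, which is why the encoding kernel always collapses the minimum rank found in a string before rescanning for new pairs.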
std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, @@ -144,31 +172,42 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return detail::load_merge_pairs_file(filename_merges, cudf::get_default_stream(), mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::load_merge_pairs(merge_pairs, stream, mr); +} + bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, std::unique_ptr>&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map) + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map) : merge_pairs(std::move(merge_pairs)), d_merge_pairs(std::move(d_merge_pairs)), - merge_pairs_map(std::move(merge_pairs_map)) + merge_pairs_map(std::move(merge_pairs_map)), + mp_table_map(std::move(mp_table_map)) { } bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr&& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) - : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream)) + : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release()) { } bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr)) + : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release()) { } -bpe_merge_pairs::~bpe_merge_pairs() = default; +bpe_merge_pairs::bpe_merge_pairs() = default; +bpe_merge_pairs::~bpe_merge_pairs() { delete impl; } } // namespace nvtext diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index a17583cf649..38cb7dd6753 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,13 +169,14 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string } // namespace detail -std::unique_ptr detokenize(cudf::strings_column_view const& strings, +std::unique_ptr detokenize(cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr); + return detail::detokenize(input, row_indices, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 1460be4fcf5..3d5f2d72e6f 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -224,7 +224,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2; rmm::device_uvector offsets(n_upper, stream); auto d_offsets = offsets.data(); - CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value())); + CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(std::ptrdiff_t), stream.value())); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 1b07b0785f5..0fc1d221b15 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -242,22 +242,24 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // external APIs -std::unique_ptr normalize_spaces(cudf::strings_column_view const& strings, +std::unique_ptr normalize_spaces(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_spaces(strings, cudf::get_default_stream(), mr); + return detail::normalize_spaces(input, stream, mr); } /** * @copydoc nvtext::normalize_characters */ -std::unique_ptr normalize_characters(cudf::strings_column_view const& strings, +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_characters(strings, do_lower_case, cudf::get_default_stream(), mr); + return detail::normalize_characters(input, do_lower_case, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 34916e121dc..a4b28fe2dab 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -274,26 +274,26 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str // external APIs -std::unique_ptr replace_tokens(cudf::strings_column_view const& strings, +std::unique_ptr replace_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_tokens( - strings, targets, replacements, delimiter, cudf::get_default_stream(), mr); + return detail::replace_tokens(input, targets, replacements, delimiter, stream, mr); } -std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, +std::unique_ptr filter_tokens(cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::filter_tokens( - strings, min_token_length, replacement, delimiter, cudf::get_default_stream(), mr); + return detail::filter_tokens(input, min_token_length, replacement, delimiter, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu deleted file mode 100644 index 13c744ac6bd..00000000000 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { - -namespace { - -template -constexpr bool is_whitespace(CharType ch) -{ - return ch <= ' '; -} - -/** - * @brief Resolve a substring up to the first whitespace character. - * - * This will return a substring of the input starting with the first byte - * up to the first whitespace character found or the end of the string. - * Any whitespace is expected only at the end of the string. - * - * @param d_str Input string to resolve. - * @return Substring of the input excluding any trailing whitespace. - */ -__device__ cudf::string_view get_first_token(cudf::string_view const& d_str) -{ - auto const begin = d_str.data(); - auto const end = thrust::find_if( - thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); - auto const size = static_cast(thrust::distance(begin, end)); - return cudf::string_view(begin, size); -} - -/** - * @brief Main byte pair encoding algorithm function for each string. - * - * @see The byte_pair_encoding_fn::operator() function below for details. - */ -template -struct byte_pair_encoding_fn { - cudf::column_device_view const d_merges; - cudf::column_device_view const d_strings; - MapRefType const d_map; - cudf::size_type* d_sizes; // output size of encoded string - string_hasher_type const hasher; - cudf::size_type* d_byte_indices; - - /** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. - * - * @param idx Index of merge pair to dissect. - * @return The left and right halves of the merge pair. 
- */ - __device__ thrust::pair dissect_merge_pair( - cudf::size_type idx) - { - auto const d_pair = d_merges.element(idx); - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected - // check for malformed pair entry to prevent segfault - if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); } - auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); - auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), - cudf::string_view(rhs + 1, rhs_size)); - } - - /** - * @brief Get the next substring of the given string. - * - * This will find the next sequence of characters identified by the - * given byte indices iterator values. The beginning of the sequence - * starts at `begin` and the end of the sequence is the first non-zero - * index found between (begin,end) exclusive. - * - * @tparam Iterator The byte indices iterator type - * @param begin Start of indices to check - * @param end End of indices to check - * @param d_str String to substring - * @return The substring found. - */ - template - __device__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) - { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + *begin, size); - } - - /** - * @brief Look up the pair of strings in the d_map/d_merges - * - * @param lhs Left half of the string - * @param rhs Right half of the string - * @return Position of merge pair within d_map - */ - __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) - { - __shared__ char shmem[48 * 1024]; // max for Pascal - auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; - auto const thread_memory_size = static_cast(sizeof(shmem) / blockDim.x); - - // Edge case check. - // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. - if (thread_memory_size < total_size) { return d_map.end(); } - - // build the target string in shared memory - char* ptr = &shmem[threadIdx.x * thread_memory_size]; - - // build a temp string like: temp = lhs + ' ' + rhs - memcpy(ptr, lhs.data(), lhs.size_bytes()); - memcpy(ptr + lhs.size_bytes(), " ", 1); - memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - - auto const d_str = cudf::string_view(ptr, total_size); - return d_map.find(d_str); - } - - /** - * @brief Byte encode each string. - * - * Each string is iteratively scanned for the minimum rank of adjacent substring pairs - * as found within the `d_map` table. Once the minimum pair is located, that pair - * is removed -- virtually by zero-ing the index value between any matching adjacent pairs. - * - * The iteration ends once there are no more adjacent pairs or there are no more - * matches found in `d_map`. At the end, the indices for each string reflect the - * encoding pattern and can be used to build the output. - * - * This function also computes the size of the encoded output of each string - * by simply counting the number of non-zero indices values remaining. This saves - * an extra kernel launch normally required to compute the offsets of the output column. 
- * - * @param idx The index of the string in `d_strings` to encode - */ - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { - d_sizes[idx] = 0; - return; - } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { - d_sizes[idx] = 0; - return; - } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - - // initialize the byte indices for this string; - // set the index value to 0 for any intermediate UTF-8 bytes - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_str.size_bytes()), - d_indices, - [data = d_str.data()](auto idx) { - auto const byte = static_cast(data[idx]); - return cudf::strings::detail::is_begin_utf8_char(byte) ? idx : 0; - }); - - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - - // keep processing the string until there are no more adjacent pairs found in d_map - cudf::size_type min_rank = 0; - while (min_rank < cuda::std::numeric_limits::max()) { - // initialize working variables - min_rank = cuda::std::numeric_limits::max(); - - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - - auto min_itr = itr; // these are set along with - auto min_size = lhs.size_bytes(); // the min_rank variable - - // check each adjacent pair against the d_map - while (itr < end) { - auto const rhs = next_substr(itr, end, d_str); - if (rhs.empty()) break; // no more adjacent pairs - - auto const map_itr = get_merge_pair(lhs, rhs); - if (map_itr != d_map.end()) { - // found a match; record the rank (and other min_ vars) - auto const rank = map_itr->second; - if (rank < min_rank) { - min_rank = rank; - min_itr = itr; - min_size = rhs.size_bytes(); - } - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - - // if any pair matched, remove every occurrence from the string - if (min_rank < cuda::std::numeric_limits::max()) { - // remove the first pair we found - itr = min_itr; - *itr = 0; - - // continue scanning for other occurrences in the remainder of the string - itr += min_size; - if (itr < end) { - auto const d_pair = dissect_merge_pair(min_rank); - - lhs = next_substr(itr, end, d_str); - itr += lhs.size_bytes(); - while (itr < end) { - auto rhs = next_substr(itr, end, d_str); - if (d_pair.first == lhs && d_pair.second == rhs) { - *itr = 0; // removes the pair from this string - itr += rhs.size_bytes(); - if (itr >= end) { break; } // done checking for pairs - // skip to the next adjacent pair - rhs = next_substr(itr, end, d_str); - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - } - } - } - - // compute and store the output size for this string's encoding - auto const encoded_size = d_str.size_bytes() + // number of original bytes + - thrust::count_if( // number of non-zero byte indices - thrust::seq, - d_indices, - d_indices + d_str.size_bytes(), - [](auto v) { return v != 0; }); - d_sizes[idx] = static_cast(encoded_size); - } -}; - -/** - * @brief Build the output string encoding. - * - * This copies each string to the output inserting a space at each non-zero byte index. 
- * - * @code{.txt} - * d_strings = ["helloworld", "testthis"] - * d_byte_indices = [ 0000050000 00004000] - * result is ["hello world", "test this"] - * @endcode - */ -struct build_encoding_fn { - cudf::column_device_view const d_strings; - cudf::size_type const* d_byte_indices; - cudf::size_type const* d_offsets; - char* d_chars{}; - - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { return; } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { return; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; - - // copy chars while indices[i]==0, - // insert space each time indices[i]!=0 - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - auto d_input = d_str.data(); - *d_output++ = *d_input++; - auto itr = begin + 1; - while (itr < end) { - if (*itr++) *d_output++ = ' '; - *d_output++ = *d_input++; - } - // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 - } -}; - -/** - * @brief Perform byte pair encoding on each string in the input column. - * - * The result is a strings column of the same size where each string has been encoded. - * - * The encoding is performed iteratively. Each pass determines the string's lowest - * ranked merge pair as determined by the strings in `merges_table`. This pair - * is removed (virtually) from each string before starting the next iteration. - * - * Once all pairs have exhausted for all strings, the output is constructed from - * the results by adding spaces between each remaining pair in each string. - * - * @param input Strings to encode. - * @param merge_pairs Merge pairs data and map used for encoding. 
- * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr byte_pair_encoding( - cudf::strings_column_view const& input, - bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, - rmm::cuda_stream_view stream) -{ - auto const d_merges = merge_pairs.get_merge_pairs(); - CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); - - // build working vector to hold index values per byte - rmm::device_uvector d_byte_indices(input.chars().size(), stream); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - auto d_offsets = offsets->mutable_view().data(); - - auto map_ref = merge_pairs.get_merge_pairs_ref(); - byte_pair_encoding_fn fn{ - d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); - - // build the output: add spaces between the remaining pairs in each string - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); - - auto const bytes = - cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column( - bytes, stream, rmm::mr::get_current_device_resource()); - auto d_chars = chars->mutable_view().data(); - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); - - return make_strings_column( - input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); -} - -/** - * @brief Detect space to not-space transitions inside each string. - * - * This handles sliced input and null strings as well. - * It is parallelized over bytes and returns true only for valid left edges - * -- non-space preceded by a space. - */ -struct edge_of_space_fn { - cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::size_type offset) - { - auto const d_chars = - d_strings.child(cudf::strings_column_view::chars_column_index).data(); - if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } - - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); - // ignore offsets outside sliced range - if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } - - auto itr = - thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset); - // ignore offsets at existing string boundaries - if (*itr == offset) { return false; } - - // count only edges for valid strings - auto const index = static_cast(thrust::distance(d_offsets, itr)) - 1; - return d_strings.is_valid(index); - } -}; - -/** - * @brief Create new offsets by identifying substrings by whitespace. - * - * This is similar to cudf::strings::split_record but does not fully split - * and only returns new offsets. The behavior is more like a view-only slice - * of the chars child with the result still including trailing delimiters. - * - * The encoding algorithm ignores the trailing whitespace of each string. - * - * @param input Strings to tokenize. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * @return New offsets including those at the edge of each space. - */ -std::unique_ptr space_offsets(cudf::strings_column_view const& input, - cudf::column_device_view const& d_strings, - rmm::cuda_stream_view stream) -{ - // count space offsets - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(input.chars().size()); - edge_of_space_fn edge_of_space{d_strings}; - auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); - - // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); - thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); - - // create output offsets - auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(space_count + input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - - // combine current offsets with space offsets - thrust::merge(rmm::exec_policy(stream), - input.offsets_begin(), - input.offsets_end(), - space_offsets.begin(), - space_offsets.end(), - result->mutable_view().begin()); - - return result; -} - -/** - * @brief Build new offsets that can be used to build a list column for calling join. - * - * This essentially returns the number of tokens for each string. - */ -struct list_offsets_fn { - cudf::column_device_view const d_strings; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return 1; // empty is a single valid result - - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(d_str.size_bytes()); - - // this counts the number of non-adjacent delimiters - auto const result = - thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) { - return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); - }); - return static_cast(result) + 1; - } -}; - -} // namespace - -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merge_pairs, - cudf::string_scalar const& separator, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.is_empty() || input.chars_size() == 0) - return cudf::make_empty_column(cudf::type_id::STRING); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto const offsets = space_offsets(input, *d_strings, stream); - - // build a view using the new offsets and the current input chars column - auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, - offsets->size() - 1, - nullptr, // no parent data - nullptr, // null-mask - 0, // null-count - 0, // offset - {offsets->view(), input.chars()}); - - // run BPE on this view - auto const bpe_column = - byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream); - - // recombine the result: - // compute the offsets needed to build a list view - auto const list_offsets = [d_strings = *d_strings, stream] { - auto offsets_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), list_offsets_fn{d_strings}); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource())); - return offsets_column; 
- }(); - - // build a list column_view using the BPE output and the list_offsets - auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - input.size(), - nullptr, // no parent data in list column - input.null_mask(), - input.null_count(), - 0, - {list_offsets->view(), bpe_column->view()}); - - // build the output strings column - auto result = - cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), - separator, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO, - cudf::strings::output_if_empty_list::EMPTY_STRING, - stream, - mr); - return result; -} - -} // namespace detail - -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merges_table, - cudf::string_scalar const& separator, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); -} - -} // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh deleted file mode 100644 index 2fa879ea734..00000000000 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include - -namespace nvtext { -namespace detail { - -using hash_value_type = uint32_t; -using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; - -/** - * @brief Hasher function used for building and using the cuco static-map - * - * This takes advantage of heterogeneous lookup feature in cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). - */ -struct bpe_hasher { - cudf::column_device_view const d_strings; - string_hasher_type hasher{}; - // used by insert - __device__ hash_value_type operator()(cudf::size_type index) const - { - return hasher(d_strings.element(index)); - } - // used by find - __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } -}; - -/** - * @brief Equal function used for building and using the cuco static-map - * - * This takes advantage of heterogeneous lookup feature in cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). 
- */ -struct bpe_equal { - cudf::column_device_view const d_strings; - // used by insert - __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept - { - return d_strings.element(lhs) == d_strings.element(rhs); - } - // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - return d_strings.element(lhs) == rhs; - } -}; - -using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; - -using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; - -using merge_pairs_map_type = cuco::experimental::static_map, - cuda::thread_scope_device, - bpe_equal, - probe_scheme, - hash_table_allocator_type>; - -} // namespace detail - -// since column_device_view::create returns is a little more than -// std::unique_ptr this helper simplifies the return type in a more maintainable -// way -using col_device_view = std::invoke_result_t; - -struct bpe_merge_pairs::bpe_merge_pairs_impl { - std::unique_ptr const merge_pairs; - col_device_view const d_merge_pairs; - std::unique_ptr merge_pairs_map; - - bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, - col_device_view&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map); - - auto const get_merge_pairs() const { return *d_merge_pairs; } - auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } -}; - -} // namespace nvtext diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 16b9f25b802..87f6a61a533 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -232,43 +232,48 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // external APIs -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiter, stream, mr); } -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiters, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiter, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiter, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiters, stream, mr); } -std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, +std::unique_ptr character_tokenize(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, cudf::get_default_stream(), mr); + return detail::character_tokenize(input, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index f998c9ec239..511f1995374 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -37,6 +39,15 @@ #include +#include +#include +#include +#include +#include +#include + +#include + namespace nvtext { namespace detail { namespace { @@ -162,6 +173,123 @@ std::unique_ptr load_vocabulary(cudf::strings_column_view c namespace detail { namespace { +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used to compute the output sizes. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the vocab_tokenize benchmark results. + */ +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128; + +constexpr int block_size = 256; + +__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr) +{ + return d_delimiters.empty() ? (chr <= ' ') : // whitespace check + thrust::any_of(thrust::seq, + d_delimiters.begin(), + d_delimiters.end(), + [chr] __device__(cudf::char_utf8 c) { return c == chr; }); +} + +struct mark_delimiters_fn { + char const* d_chars; + cudf::string_view const d_delimiter; + int8_t* d_results; + + __device__ void operator()(cudf::size_type idx) const + { + auto const ptr = d_chars + idx; + if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; } + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(ptr, chr); + auto const output = is_delimiter(d_delimiter, chr); + while (ch_size > 0) { + d_results[idx++] = output; + --ch_size; + } + } +}; + +__global__ void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) +{ + // string per warp + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { + return; + } + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + auto const chars_begin = + d_strings.child(cudf::strings_column_view::chars_column_index).data() + + offsets[d_strings.offset()]; + + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + auto const d_output = d_results + offset; + auto const d_output_end = d_output + d_str.size_bytes(); + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage warp_storage; + + cudf::size_type count = 0; + if (lane_idx == 0) { + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(begin, chr); + auto output = 
1; + if (begin > chars_begin) { + auto ptr = begin - 1; + while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) { + --ptr; + } + cudf::strings::detail::to_char_utf8(ptr, chr); + output = !is_delimiter(d_delimiter, chr); + } + auto ptr = d_output; + while (ch_size > 0) { + *ptr++ = output; + --ch_size; + } + count = ((begin + ch_size) == end); + } + __syncwarp(); + + for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) { + // add one if at the edge of a token or if at the string's end + if (*itr) { + count += !(*(itr - 1)); + } else { + count += (itr + 1 == d_output_end); + } + } + __syncwarp(); + + // add up the counts from the other threads to compute the total token count for this string + auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum()); + if (lane_idx == 0) { d_counts[str_idx] = total_count; } +} + /** * @brief Tokenizes each string and uses the map to assign token id values * @@ -197,6 +325,33 @@ struct vocabulary_tokenizer_fn { } }; +template +struct transform_tokenizer_fn { + cudf::string_view const d_delimiter; + MapRefType d_map; + cudf::size_type const default_id; + + __device__ cudf::size_type operator()(cudf::string_view d_str) const + { + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + + auto itr = begin; + while (itr < end) { + cudf::char_utf8 chr = 0; + auto const ch_size = cudf::strings::detail::to_char_utf8(itr, chr); + if (!is_delimiter(d_delimiter, chr)) break; + itr += ch_size; + } + + auto const size = static_cast(thrust::distance(itr, end)); + auto const token = cudf::string_view{itr, size}; + // lookup token in map + auto const fitr = d_map.find(token); + return (fitr != d_map.end()) ? fitr->second : default_id; + } +}; + } // namespace std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, @@ -209,28 +364,94 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } + if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); } // count the tokens per string and build the offsets from the counts auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto const d_delimiter = delimiter.value(stream); - auto const sizes_itr = - cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); - auto [token_offsets, total_count] = - cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto map_ref = vocabulary._impl->get_map_ref(); + auto const zero_itr = thrust::make_counting_iterator(0); + + if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + auto const sizes_itr = + cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); + auto [token_offsets, total_count] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + + // build the output column to hold all the token ids + auto tokens = cudf::make_numeric_column( + output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_tokens = tokens->mutable_view().data(); + auto d_offsets = token_offsets->view().data(); + vocabulary_tokenizer_fn tokenizer{ + *d_strings, d_delimiter, map_ref, 
default_id, d_offsets, d_tokens}; + thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer); + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); + } + + // longer strings perform better with warp-parallel approach + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + rmm::device_uvector d_token_counts(input.size(), stream); + rmm::device_uvector d_marks(chars_size, stream); + + // mark position of all delimiters + thrust::for_each_n(rmm::exec_policy(stream), + zero_itr, + chars_size, + mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()}); + + // launch warp per string to compute token counts + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + token_counts_fn<<>>( + *d_strings, d_delimiter, d_token_counts.data(), d_marks.data()); + auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( + d_token_counts.begin(), d_token_counts.end(), stream, mr); + + rmm::device_uvector d_tmp_offsets(total_count + 1, stream); + d_tmp_offsets.set_element(total_count, chars_size, stream); + thrust::copy_if(rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(chars_size), + d_tmp_offsets.begin(), + [d_marks = d_marks.data()] __device__(auto idx) { + if (idx == 0) return true; + return d_marks[idx] && !d_marks[idx - 1]; + }); + + auto tmp_offsets = + std::make_unique(std::move(d_tmp_offsets), rmm::device_buffer{}, 0); + auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0); + auto const tmp_input = cudf::column_view( + input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars}); + + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - // build the output column to hold all the token ids auto tokens = cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); - auto map_ref = vocabulary._impl->get_map_ref(); - auto d_offsets = token_offsets->view().data(); - auto d_tokens = tokens->mutable_view().data(); - vocabulary_tokenizer_fn tokenizer{ - *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - tokenizer); + auto d_tokens = tokens->mutable_view().data(); + + transform_tokenizer_fn tokenizer{d_delimiter, map_ref, default_id}; + thrust::transform(rmm::exec_policy(stream), + d_tmp_strings->begin(), + d_tmp_strings->end(), + d_tokens, + tokenizer); return cudf::make_lists_column(input.size(), std::move(token_offsets), diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..8421f32056e 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - 
auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -415,10 +415,11 @@ std::unique_ptr cast(column_view const& input, std::unique_ptr cast(column_view const& input, data_type type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cast(input, type, cudf::get_default_stream(), mr); + return detail::cast(input, type, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..88922362319 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); @@ -642,10 +646,11 @@ std::unique_ptr unary_operation(cudf::column_view const& input, std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unary_operation(input, op, cudf::get_default_stream(), mr); + return detail::unary_operation(input, op, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 2cf83466b03..092ad3b6731 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
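The cast_ops.cu and math_ops.cu hunks above, and the nan_ops.cu/null_ops.cu hunks that follow, all apply the same two-part change: implementation code calls the stream-aware detail:: helpers (e.g. detail::copy_bitmask) instead of the public API, and each public entry point gains an explicit rmm::cuda_stream_view that it forwards verbatim rather than substituting cudf::get_default_stream(). A minimal sketch of that wrapper pattern, using a made-up frobnicate function rather than any real libcudf API:

// Sketch only: `frobnicate` is a hypothetical name standing in for cast/unary_operation/is_nan/etc.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <memory>

namespace cudf {
namespace detail {
// Stream-aware implementation: all kernels and allocations use `stream`.
std::unique_ptr<column> frobnicate(column_view const& input,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
{
  return std::make_unique<column>(input, stream, mr);  // placeholder body: deep copy of the input
}
}  // namespace detail

// Public API after the change: the stream is an explicit parameter forwarded as-is,
// rather than being replaced by cudf::get_default_stream() inside the wrapper.
std::unique_ptr<column> frobnicate(column_view const& input,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  return detail::frobnicate(input, stream, mr);
}
}  // namespace cudf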
@@ -91,17 +91,20 @@ std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
 
 } // namespace detail
 
-std::unique_ptr<column> is_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> is_nan(cudf::column_view const& input,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_nan(input, cudf::get_default_stream(), mr);
+  return detail::is_nan(input, stream, mr);
 }
 
 std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_not_nan(input, cudf::get_default_stream(), mr);
+  return detail::is_not_nan(input, stream, mr);
 }
 
 } // namespace cudf
diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu
index e64c68fdae6..6bdd65dd42d 100644
--- a/cpp/src/unary/null_ops.cu
+++ b/cpp/src/unary/null_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,17 +55,20 @@ std::unique_ptr<column> is_valid(cudf::column_view const& input,
 
 } // namespace detail
 
-std::unique_ptr<column> is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> is_null(cudf::column_view const& input,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_null(input, cudf::get_default_stream(), mr);
+  return detail::is_null(input, stream, mr);
 }
 
 std::unique_ptr<column> is_valid(cudf::column_view const& input,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_valid(input, cudf::get_default_stream(), mr);
+  return detail::is_valid(input, stream, mr);
 }
 
 } // namespace cudf
diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp
index bc10dd7845a..b0078ff85a2 100644
--- a/cpp/src/utilities/traits.cpp
+++ b/cpp/src/utilities/traits.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
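With the null and NaN predicates above now accepting a stream, callers can keep all work for a batch on a single caller-owned stream. An illustrative usage sketch, not part of this PR; the column contents and the use of the test wrapper are made up purely to keep the example short:

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/unary.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  rmm::cuda_stream stream;  // caller-owned, non-default stream

  // Small column with two nulls.
  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 2, 3, 4}, {1, 0, 1, 0});
  cudf::column_view const input = col;

  auto* mr = rmm::mr::get_current_device_resource();

  // Both predicates are enqueued on `stream`; nothing is implicitly issued on the default stream.
  auto nulls  = cudf::is_null(input, stream.view(), mr);
  auto valids = cudf::is_valid(input, stream.view(), mr);

  stream.view().synchronize();
  return 0;
}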
@@ -158,6 +158,19 @@ struct is_integral_impl { bool is_integral(data_type type) { return cudf::type_dispatcher(type, is_integral_impl{}); } +struct is_integral_not_bool_impl { + template + constexpr bool operator()() + { + return is_integral_not_bool(); + } +}; + +bool is_integral_not_bool(data_type type) +{ + return cudf::type_dispatcher(type, is_integral_not_bool_impl{}); +} + struct is_floating_point_impl { template constexpr bool operator()() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..1be8566fb0f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -357,6 +357,7 @@ ConfigureTest( ConfigureTest( UTILITIES_TEST utilities_tests/type_list_tests.cpp + utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp @@ -392,6 +393,7 @@ set_tests_properties( ConfigureTest( ITERATOR_TEST iterator/indexalator_test.cu + iterator/offsetalator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu iterator/pair_iterator_test_chrono.cu @@ -522,7 +524,6 @@ ConfigureTest( strings/format_lists_tests.cpp strings/integers_tests.cpp strings/ipv4_tests.cpp - strings/json_tests.cpp strings/like_tests.cpp strings/pad_tests.cpp strings/repeat_strings_tests.cpp @@ -536,6 +537,10 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * json path test -------------------------------------------------------------------------------- +ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) + # ################################################################################################## # * structs test ---------------------------------------------------------------------------------- ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_tests.cpp) @@ -616,27 +621,53 @@ ConfigureTest( # * bin tests ---------------------------------------------------------------------------------- ConfigureTest(LABEL_BINS_TEST labeling/label_bins_tests.cpp) +# ################################################################################################## +# * jit tests ---------------------------------------------------------------------------------- +ConfigureTest(JIT_PARSER_TEST jit/parse_ptx_function.cpp) +target_include_directories(JIT_PARSER_TEST PRIVATE "$") + # ################################################################################################## # * stream testing --------------------------------------------------------------------------------- ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) +ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) 
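The ConfigureTest(... STREAM_MODE testing) registrations above and below build each listed test against the stream-verification harness. The contents of the new test sources are not shown in this diff; a plausible minimal shape for one of them (e.g. streams/unary_test.cpp, registered at the end of this hunk) passes cudf::test::get_default_stream() to every public call so that any hidden use of the real default stream is flagged. This is an assumed sketch, not the actual file:

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/unary.hpp>

#include <gtest/gtest.h>

// Hypothetical stream-mode test: exercise a public API on the test stream only.
class UnaryTest : public cudf::test::BaseFixture {};

TEST_F(UnaryTest, UnaryOperation)
{
  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40};
  // When built with STREAM_MODE testing, the stream-identification machinery
  // fails the test if this call issues work on the real default stream.
  cudf::unary_operation(column, cudf::unary_operator::ABS, cudf::test::get_default_stream());
}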
+ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + STREAM_STRINGS_TEST + streams/strings/case_test.cpp + streams/strings/combine_test.cpp + streams/strings/contains_test.cpp + streams/strings/convert_test.cpp + streams/strings/extract_test.cpp + streams/strings/filter_test.cpp + streams/strings/find_test.cpp + streams/strings/replace_test.cpp + streams/strings/reverse_test.cpp + streams/strings/split_test.cpp + streams/strings/strings_tests.cpp + STREAM_MODE testing ) -ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp + streams/text/tokenize_test.cpp STREAM_MODE testing +) +ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index c0109a40cec..624a781c5b9 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -316,6 +316,33 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); } +TEST_F(TransformTest, ImbalancedTreeArithmeticDeep) +{ + auto c_0 = column_wrapper{4, 5, 6}; + auto table = cudf::table_view{{c_0}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + + // expression: (c0 < c0) == (c0 < (c0 + c0)) + // {false, false, false} == (c0 < {8, 10, 12}) + // {false, false, false} == {true, true, true} + // {false, false, false} + auto expression_left_subtree = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_0); + auto expression_right_inner_subtree = + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_0); + auto expression_right_subtree = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, expression_right_inner_subtree); + + auto expression_tree = cudf::ast::operation( + cudf::ast::ast_operator::EQUAL, expression_left_subtree, expression_right_subtree); + + auto result = cudf::compute_column(table, expression_tree); + auto expected = column_wrapper{false, false, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); +} + TEST_F(TransformTest, MultiLevelTreeComparator) { auto c_0 = column_wrapper{3, 20, 1, 50}; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index c5833f40cf2..612486d8e5c 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -67,6 +67,7 @@ auto groupby_histogram(cudf::column_view const& keys, auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals}, 
cudf::order::ASCENDING, cudf::null_order::BEFORE, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return std::pair{std::move(sorted_keys), std::move(sorted_histograms)}; diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index f85fc6335f6..af6f613d344 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index fc8f5b37f7e..2c5f7458ce5 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -186,7 +186,7 @@ template auto constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(T); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -scale), + arrow::Decimal128Builder decimal_builder(arrow::decimal(cudf::detail::max_precision(), -scale), arrow::default_memory_pool()); for (T i = 0; i < static_cast(data.size() / BIT_WIDTH_RATIO); ++i) { diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 6bb4cdfd747..d6762e70d80 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -604,7 +604,9 @@ struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowDecimalScalarTest, Basic) { auto const value{42}; - auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + auto const precision = + cudf::detail::max_precision<__int128_t>(); // cudf will convert to the widest-precision Arrow + // scalar of the type int32_t const scale{4}; auto const cudf_scalar = diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3d6743702b8..20b8674a717 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -216,14 +216,15 @@ TEST_F(LogicalStackTest, GroundTruth) stream.value())); // Run algorithm - fst::sparse_stack_op_to_top_of_stack(d_stack_ops.data(), - d_stack_op_idx_span, - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - string_size, - stream.value()); + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream.value()); // Async copy results from device to host top_of_stack_gpu.device_to_host_async(stream_view); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7c911ac2e04..a2db2d69984 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1422,7 +1423,9 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); cudf::table_view const expected = tbl_view; std::map types; @@ -1957,12 +1960,36 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) // 2 -> (invalid) R"({"b":{"a":[321})" "\n" - // 3 -> c: [1] (valid) + // 3 -> c: 1.2 (valid) R"({"c":1.2})" "\n" "\n" - // 4 -> a: 123 (valid) - R"({"a":123})"; + // 4 -> a: 4 (valid) + R"({"a":4})" + "\n" + // 5 -> (invalid) + R"({"a":5)" + "\n" + // 6 -> (invalid) + R"({"a":6 )" + "\n" + // 7 -> (invalid) + R"({"b":[7 )" + "\n" + 
// 8 -> a: 8 (valid) + R"({"a":8})" + "\n" + // 9 -> (invalid) + R"({"d":{"unterminated_field_name)" + "\n" + // 10 -> (invalid) + R"({"d":{)" + "\n" + // 11 -> (invalid) + R"({"d":{"123",)" + "\n" + // 12 -> a: 12 (valid) + R"({"a":12})"; auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; { @@ -1978,17 +2005,89 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); - EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->num_rows(), 13); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - std::vector a_validity{true, false, false, false, true}; - std::vector c_validity{false, false, false, true, false}; + std::vector a_validity{ + true, false, false, false, true, false, false, false, true, false, false, false, true}; + std::vector c_validity{ + false, false, false, true, false, false, false, false, false, false, false, false, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 12}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + c_validity.cbegin()}); +} + +TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) +{ + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + // Child column b->a + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); + int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), 
cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 3a4074c02ad..a85a696565b 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -49,14 +50,16 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -64,7 +67,8 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -89,17 +93,22 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -121,7 +130,9 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -151,7 +162,9 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = 
R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -183,7 +196,9 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -216,7 +231,8 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -291,7 +307,8 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) @@ -314,7 +331,8 @@ TEST_F(JsonWriterTest, WriteReadNested) // without column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -352,7 +370,8 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} @@ -385,7 +404,9 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -424,7 +445,9 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -480,7 +503,9 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({} {"e":1} {"d":1} @@ -542,7 
+567,9 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 00d657108b8..b0ffbe3d154 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -285,6 +285,123 @@ TEST_F(JsonTest, StackContextRecovering) CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); } +TEST_F(JsonTest, StackContextRecoveringFuzz) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + std::random_device rd; + std::mt19937 gen(42); + std::uniform_int_distribution distribution(0, 4); + constexpr std::size_t input_length = 1024 * 1024; + std::string input{}; + input.reserve(input_length); + + bool inside_quotes = false; + std::stack host_stack{}; + for (std::size_t i = 0; i < input_length; ++i) { + bool is_ok = true; + char current{}; + do { + int rand_char = distribution(gen); + is_ok = true; + switch (rand_char) { + case 0: current = '{'; break; + case 1: current = '['; break; + case 2: current = '}'; break; + case 3: current = '"'; break; + case 4: current = '\n'; break; + } + switch (current) { + case '"': inside_quotes = !inside_quotes; break; + case '{': + if (!inside_quotes) { host_stack.push('{'); } + break; + case '[': + if (!inside_quotes) { host_stack.push('['); } + break; + case '}': + if (!inside_quotes) { + if (host_stack.size() > 0) { + // Get the proper 'pop' stack symbol + current = (host_stack.top() == '{' ? 
'}' : ']'); + host_stack.pop(); + } else + is_ok = false; + } + break; + case '\n': + // Increase chance to have longer lines + if (distribution(gen) == 0) { + is_ok = false; + break; + } else { + host_stack = {}; + inside_quotes = false; + break; + } + } + } while (!is_ok); + input += current; + } + + std::string expected_stack_context{}; + expected_stack_context.reserve(input_length); + inside_quotes = false; + host_stack = std::stack{}; + for (auto const current : input) { + // Write the stack context for the current input symbol + if (host_stack.empty()) { + expected_stack_context += '_'; + } else { + expected_stack_context += host_stack.top(); + } + + switch (current) { + case '"': inside_quotes = !inside_quotes; break; + case '{': + if (!inside_quotes) { host_stack.push('{'); } + break; + case '[': + if (!inside_quotes) { host_stack.push('['); } + break; + case '}': + if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); } + break; + case ']': + if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); } + break; + case '\n': + host_stack = {}; + inside_quotes = false; + break; + } + } + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; + cudf::detail::hostdevice_vector stack_context(input.size(), stream); + + // Run algorithm + constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); + + // Copy back the results + stack_context.device_to_host_async(stream); + + // Make sure we copied back the stack context + stream.synchronize(); + + ASSERT_EQ(expected_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(expected_stack_context, stack_context, stack_context.size()); +} + TEST_F(JsonTest, TokenStream) { using cuio_json::PdaTokenT; @@ -543,7 +660,7 @@ TEST_F(JsonTest, RecoveringTokenStream) { // Test input. 
Inline comments used to indicate character indexes // 012345678 <= line 0 - std::string const input = R"({"a":-2},)" + std::string const input = R"({"a":2 {})" // 9 "\n" // 01234 <= line 1 @@ -569,23 +686,12 @@ TEST_F(JsonTest, RecoveringTokenStream) // Line 0 (invalid) {0, token_t::StructBegin}, {0, token_t::StructEnd}, - // Line 1 (valid) - {10, token_t::StructBegin}, - {11, token_t::StructMemberBegin}, - {11, token_t::FieldNameBegin}, - {13, token_t::FieldNameEnd}, - // Line 2 (valid) - {16, token_t::StructBegin}, - {17, token_t::StructMemberBegin}, - {17, token_t::FieldNameBegin}, - {19, token_t::FieldNameEnd}, - {21, token_t::StructBegin}, - {22, token_t::StructMemberBegin}, - {22, token_t::FieldNameBegin}, - {24, token_t::FieldNameEnd}, - {26, token_t::ListBegin}, - {27, token_t::ValueBegin}, - {30, token_t::ValueEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, // Line 3 (valid) {31, token_t::StructBegin}, {32, token_t::StructMemberBegin}, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 890ef914713..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul); @@ -1299,20 +1307,16 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // This test can now be implemented with libcudf; keeping the pandas version to keep the test // inputs diversified // Method to create file: - // >>> import pyorc - // >>> output = open("./temp.orc", "wb") - // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) - // >>> writer.write((1, 3)) - // >>> writer.write((2, 4)) - // >>> writer.write((None, 5)) - // >>> writer.close() + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") // // Contents of file: // >>> import pyarrow.orc as po - // >>> po.ORCFile('new.orc').read() + // >>> po.ORCFile('temp.orc').read() // pyarrow.Table // a: int64 // b: int64 @@ -1934,4 +1938,34 @@ TEST_F(OrcStatisticsTest, AllNulls) check_all_null_stats(stats.file_stats[3]); } 
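The OrcStatisticsTest changes above reflect that the writer does not yet record nanosecond bounds (pending ORC-135 support), so minimum_nanos/maximum_nanos now read back as empty optionals. A reader-side sketch of consuming those fields defensively; the header path and struct name are assumptions based on how the statistics types are used in this test file:

#include <cudf/io/orc_metadata.hpp>

#include <iostream>

// Print timestamp bounds from parsed ORC column statistics, tolerating writers
// that do not record nanosecond precision (as cudf currently does not, until
// ORC-135 support lands). `ts` would typically come from parsed file statistics.
void print_timestamp_bounds(cudf::io::timestamp_statistics const& ts)
{
  if (ts.minimum_utc) { std::cout << "min (UTC): " << *ts.minimum_utc << "\n"; }
  if (ts.maximum_utc) { std::cout << "max (UTC): " << *ts.maximum_utc << "\n"; }

  // Nanosecond adjustments are optional; guard before dereferencing.
  if (ts.minimum_nanos) {
    std::cout << "min nanos: " << *ts.minimum_nanos << "\n";
  } else {
    std::cout << "min nanos not recorded by this writer\n";
  }
  if (ts.maximum_nanos) { std::cout << "max nanos: " << *ts.maximum_nanos << "\n"; }
}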
+TEST_F(OrcWriterTest, UnorderedDictionary) +{ + std::vector strings{ + "BBBB", "BBBB", "CCCC", "BBBB", "CCCC", "EEEE", "CCCC", "AAAA", "DDDD", "EEEE"}; + str_col col(strings.begin(), strings.end()); + + table_view expected({col}); + + std::vector out_buffer_sorted; + cudf::io::orc_writer_options out_opts_sorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); + cudf::io::write_orc(out_opts_sorted); + + cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_sorted.data(), out_buffer_sorted.size()}); + auto const from_sorted = cudf::io::read_orc(in_opts_sorted).tbl; + + std::vector out_buffer_unsorted; + cudf::io::orc_writer_options out_opts_unsorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_unsorted}, expected) + .enable_dictionary_sort(false); + cudf::io::write_orc(out_opts_unsorted); + + cudf::io::orc_reader_options in_opts_unsorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); + auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; + + CUDF_TEST_EXPECT_TABLES_EQUAL(*from_sorted, *from_unsorted); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..fece83f891b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes 
the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. 
-cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. -cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -348,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { template struct ParquetReaderSourceTest : public ParquetReaderTest {}; +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; @@ -379,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -3686,7 +3693,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3704,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. 
sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3729,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3740,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3766,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3777,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3805,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4038,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4048,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4074,16 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. 
int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + thrust::optional const& ctype) { + auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { - case cudf::io::parquet::INT32: - switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::INT32: + switch (ctype_val) { + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4091,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype_val == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4150,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4156,18 +4164,20 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4220,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4237,6 +4247,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4245,20 +4258,20 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4324,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4339,7 +4352,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 
0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4351,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4362,10 +4378,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4419,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4431,7 +4447,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4444,12 +4465,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4458,9 +4479,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4516,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4528,13 +4549,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4542,9 +4566,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4610,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4640,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4735,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4836,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4824,11 +4848,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4870,7 +4897,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4885,8 +4912,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -5030,10 +5059,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5110,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, 
cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5166,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5235,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5281,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5332,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
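The dictionary tests in this stretch (SingleValueDictionaryTest, DictionaryNeverTest, DictionaryAdaptiveTest, DictionaryAlwaysTest) differ mainly in the dictionary policy they hand to the writer before the encoding verification that follows. The exact builder chain sits outside these hunks; a minimal, hypothetical configuration using the public cudf::io options (`tbl` standing in for the test table) might look roughly like:

  cudf::io::parquet_writer_options out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
      // NEVER / ADAPTIVE / ALWAYS control whether column chunks may use dictionary
      // encoding; the used_dict checks below verify the encodings actually written.
      .dictionary_policy(cudf::io::dictionary_policy::ADAPTIVE);
  cudf::io::write_parquet(out_opts);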
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5383,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5467,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6693,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6672,7 +6701,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6697,7 +6726,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; @@ -6707,10 +6736,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data 
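The contains() assertions here and just below scan a column chunk's encodings list; the helper itself is defined a few lines above this hunk. A minimal sketch of what it presumably looks like, assuming the chunk's `encodings` member is a std::vector of Encoding values and <algorithm> is included:

  auto const contains = [](std::vector<cudf::io::parquet::detail::Encoding> const& encodings,
                           cudf::io::parquet::detail::Encoding enc) {
    // True when the column chunk reports `enc` among its encodings.
    return std::find(encodings.cbegin(), encodings.cend(), enc) != encodings.cend();
  };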
EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6732,4 +6761,212 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. +using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto 
const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, + 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, + 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 
0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, + 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, + 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, + 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, + 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, + 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, + 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, + 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, + 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, + 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, + 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, + 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, + 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, + 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, + 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, + 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().column(0).size(), 6); + EXPECT_EQ(result.tbl->view().num_columns(), 2); + + column_wrapper col0{1, 2, 3, 4, 5, 6}; + column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; + cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); + + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), struct_col.release(), 
2, std::move(mask)); + + std::vector> struct_children; + struct_children.push_back(std::move(list_col)); + + auto outer_struct = + cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; + table_view expected{{col0, outer_struct}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 1ff7f4c42a5..0c10853ec02 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -20,9 +20,13 @@ #include +#include +#include #include #include #include +#include +#include using TestingTypes = cudf::test::IntegralTypesNotBool; @@ -94,3 +98,62 @@ TYPED_TEST(IndexalatorTest, optional_iterator) auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); this->iterator_test_thrust(expected_values, it_dev, host_values.size()); } + +template +struct transform_fn { + __device__ cudf::size_type operator()(Integer v) + { + return static_cast(v) + static_cast(v); + } +}; + +TYPED_TEST(IndexalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = + cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = + cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::indexalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather( + rmm::exec_policy_nosync(stream), d_map.begin(), d_map.end(), input.begin(), itr); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + thrust::scatter( + rmm::exec_policy_nosync(stream), input.begin(), input.end(), d_map.begin(), itr); + expected = + cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + + thrust::transform( + rmm::exec_policy(stream), input.begin(), input.end(), itr, transform_fn{}); + expected = + cudf::test::fixed_width_column_wrapper({0, 12, 14, 28, 46, 66, 86, 90, 126}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = + cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto indices = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_indices = cudf::column_view(indices); + thrust::lower_bound(rmm::exec_policy(stream), + d_indices.begin(), + d_indices.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu new file mode 100644 index 00000000000..e569e58f42a --- /dev/null +++ b/cpp/tests/iterator/offsetalator_test.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using TestingTypes = cudf::test::Types; + +template +struct OffsetalatorTest : public IteratorTest {}; + +TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes); + +TYPED_TEST(OffsetalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(OffsetalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::offsetalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather(rmm::exec_policy_nosync(stream), + d_map.begin(), + d_map.end(), + input.begin(), + itr); + auto expected = cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::scatter(rmm::exec_policy_nosync(stream), + input.begin(), + input.end(), + d_map.begin(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto offsets = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_offsets = cudf::column_view(offsets); + thrust::lower_bound(rmm::exec_policy(stream), + d_offsets.begin(), + d_offsets.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} + +namespace { +/** + * For testing creating and using the offsetalator in device code. 
+ */ +struct device_functor_fn { + cudf::column_device_view const d_col; + __device__ int32_t operator()(int idx) + { + auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type()); + return static_cast(itr[idx] * 3); + } +}; +} // namespace + +TYPED_TEST(OffsetalatorTest, device_offsetalator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto input = cudf::column_view(d_col1); + auto output = cudf::mutable_column_view(d_col2); + auto stream = cudf::get_default_stream(); + + auto d_input = cudf::column_device_view::create(input, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + output.begin(), + device_functor_fn{*d_input}); + + auto expected = + cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp new file mode 100644 index 00000000000..5f00c5f561a --- /dev/null +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +struct JitParseTest : public ::testing::Test {}; + +TEST_F(JitParseTest, PTXNoFunction) +{ + std::string raw_ptx = R"( +.visible .entry _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + ret; +})"; + + EXPECT_THROW(cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}), + cudf::logic_error); +} + +inline bool ptx_equal(std::string input, std::string expected) +{ + // Remove all whitespace and newline characters and compare + // This allows us to handle things like excess newline characters + // and trailing whitespace in the 'input' + + auto whitespace_or_newline = [](unsigned char c) { return std::isspace(c) || c == '\n'; }; + input.erase(std::remove_if(input.begin(), input.end(), whitespace_or_newline), input.end()); + expected.erase(std::remove_if(expected.begin(), expected.end(), whitespace_or_newline), + expected.end()); + return input == expected; +} + +TEST_F(JitParseTest, SimplePTX) +{ + std::string raw_ptx = R"( +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eff( + .param .b64 _ZN8__main__7add$241Eff_param_0, + .param .b32 _ZN8__main__7add$241Eff_param_1, + .param .b32 _ZN8__main__7add$241Eff_param_2 +) +{ + ret; +} +)"; + + std::string expected = R"( +__device__ __inline__ void GENERIC_OP( + float* _ZN8__main__7add_241Eff_param_0, + int _ZN8__main__7add_241Eff_param_1, + int _ZN8__main__7add_241Eff_param_2 +){ + asm volatile ("{"); + asm volatile ("bra RETTGT;"); + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}); + + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragma) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ +$L__BB0_151: + .pragma "nounroll"; + mov.u32 % r1517, % r1516; + mov.u32 % r1516, % r1515; + mov.u32 % r1515, % r1505; + mov.u32 % r1457, 0; +$L__BB0_152: + .pragma "nounroll"; +})"; + + std::string expected = R"( +__device__ __inline__ void EmptyKern(){ + asm volatile ("{"); asm volatile (" $L__BB0_151: .pragma \"nounroll\";"); + /** $L__BB0_151: + .pragma "nounroll" */ + + asm volatile (" mov.u32 _ r1517, _ r1516;"); + /** mov.u32 % r1517, % r1516 */ + + asm volatile (" mov.u32 _ r1516, _ r1515;"); + /** mov.u32 % r1516, % r1515 */ + + asm volatile (" mov.u32 _ r1515, _ r1505;"); + /** mov.u32 % r1515, % r1505 */ + + asm volatile (" mov.u32 _ r1457, 0;"); + /** mov.u32 % r1457, 0 */ + + asm volatile (" $L__BB0_152: .pragma \"nounroll\";"); + /** $L__BB0_152: + .pragma "nounroll" */ + + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = cudf::jit::parse_single_function_ptx(raw_ptx, "EmptyKern", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragmaWithSpaces) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80]; + setp.ne.s32 % p394, % r1419, 22; + mov.u32 % r2050, 0; + mov.u32 % r2048, % r2050; + @ % p394 bra $L__BB0_380; + + ld.param.u8 % rs1369, [% rd419 + 208]; + setp.eq.s16 % p395, % rs1369, 0; + selp.b32 % r1422, % r1925, 0, % p395; + ld.param.u32 % r1423, [% rd419 + 112]; + add.s32 % r427, % r1422, % r1423; + ld.param.u64 % rd1249, [% rd419 + 120]; + cvta.to.global.u64 % rd1250, % rd1249; + .pragma "used_bytes_mask 4095"; + ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250]; + ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 
16]; + ld.global.s32 % rd230, [% rd1250 + 32]; + setp.gt.s32 % p396, % r1424, 6; + @ % p396 bra $L__BB0_376; +} +} +)"; + + std::string expected = R"( +__device__ __inline__ void LongKernel(){ + asm volatile ("{"); asm volatile (" $L__BB0_58: cvt.u32.u32 _ %0, [_ rd419 + 80];": : "r"(r1419)); + /** $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80] */ + + asm volatile (" setp.ne.s32 _ p394, _ r1419, 22;"); + /** setp.ne.s32 % p394, % r1419, 22 */ + + asm volatile (" mov.u32 _ r2050, 0;"); + /** mov.u32 % r2050, 0 */ + + asm volatile (" mov.u32 _ r2048, _ r2050;"); + /** mov.u32 % r2048, % r2050 */ + + asm volatile (" @ _ p394 bra $L__BB0_380;"); + /** @ % p394 bra $L__BB0_380 */ + + asm volatile (" cvt.u8.u8 _ %0, [_ rd419 + 208];": : "h"( static_cast(rs1369))); + /** ld.param.u8 % rs1369, [% rd419 + 208] */ + + asm volatile (" setp.eq.s16 _ p395, _ rs1369, 0;"); + /** setp.eq.s16 % p395, % rs1369, 0 */ + + asm volatile (" selp.b32 _ r1422, _ r1925, 0, _ p395;"); + /** selp.b32 % r1422, % r1925, 0, % p395 */ + + asm volatile (" cvt.u32.u32 _ %0, [_ rd419 + 112];": : "r"(r1423)); + /** ld.param.u32 % r1423, [% rd419 + 112] */ + + asm volatile (" add.s32 _ r427, _ r1422, _ r1423;"); + /** add.s32 % r427, % r1422, % r1423 */ + + asm volatile (" mov.u64 _ %0, [_ rd419 + 120];": : "l"(rd1249)); + /** ld.param.u64 % rd1249, [% rd419 + 120] */ + + asm volatile (" cvta.to.global.u64 _ rd1250, _ rd1249;"); + /** cvta.to.global.u64 % rd1250, % rd1249 */ + + asm volatile (" .pragma \"used_bytes_mask 4095\";"); + /** .pragma "used_bytes_mask 4095" */ + + asm volatile (" ld.global.v4.u32{ _ r1424, _ r1425, _ r1426, _ r1427}, [_ rd1250];"); + /** ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250] */ + + asm volatile (" ld.global.v2.u64{ _ rd1251, _ rd1252}, [_ rd1250 + 16];"); + /** ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 16] */ + + asm volatile (" ld.global.s32 _ rd230, [_ rd1250 + 32];"); + /** ld.global.s32 % rd230, [% rd1250 + 32] */ + + asm volatile (" setp.gt.s32 _ p396, _ r1424, 6;"); + /** setp.gt.s32 % p396, % r1424, 6 */ + + asm volatile (" @ _ p396 bra $L__BB0_376;"); + /** @ % p396 bra $L__BB0_376 */ + + asm volatile ("RETTGT:}");} + )"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "LongKernel", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 089db315748..a416df0c7c3 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1941,62 +1941,6 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) -{ - // When joining on a STRUCT column, if the parent nulls are not reflected in - // the children, the join might produce incorrect results. - // - // In this test, a fact table of structs is joined against a dimension table. - // Both tables must match (only) on the NULL row. This will fail if the fact table's - // nulls are not pushed down into its children. 
- using ints = column_wrapper; - using structs = cudf::test::structs_column_wrapper; - using namespace cudf::test::iterators; - - auto make_table = [](auto&& col) { - auto columns = CVector{}; - columns.push_back(std::move(col)); - return cudf::table{std::move(columns)}; - }; - - auto const fact_table = [make_table] { - auto fact_ints = ints{0, 1, 2, 3, 4}; - auto fact_structs = structs{{fact_ints}, no_nulls()}.release(); - // Now set struct validity to invalidate index#3. - cudf::detail::set_null_mask( - fact_structs->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); - // Struct row#3 is null, but Struct.child has a non-null value. - return make_table(std::move(fact_structs)); - }(); - - auto const dimension_table = [make_table] { - auto dim_ints = ints{999}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - return make_table(dim_structs.release()); - }(); - - auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); - EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. - - // Note: Join result might not have nulls pushed down, since it's an output of gather(). - // Must superimpose parent nulls before comparisons. - auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls( - *result, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - auto const expected = [] { - auto fact_ints = ints{0}; - auto fact_structs = structs{{fact_ints}, null_at(0)}; - auto dim_ints = ints{0}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - auto columns = CVector{}; - columns.push_back(fact_structs.release()); - columns.push_back(dim_structs.release()); - return cudf::table{std::move(columns)}; - }(); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected); -} - using lcw = cudf::test::lists_column_wrapper; using cudf::test::iterators::null_at; diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/json/json_tests.cpp similarity index 84% rename from cpp/tests/strings/json_tests.cpp rename to cpp/tests/json/json_tests.cpp index d74bb9258fa..a03880eef5d 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -85,7 +85,7 @@ TEST_F(JsonPathTests, GetJsonObjectRootOp) // root cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); auto expected = drop_whitespace(input); @@ -98,7 +98,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -147,7 +147,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -193,7 +193,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -242,7 +242,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -297,7 +297,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -319,7 +319,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store['bicycle']"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -338,7 +338,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -387,7 +387,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]['isbn']"); - auto result_raw = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{R"(["0-553-21311-3","0-395-19395-8"])"}; @@ -399,7 +399,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -412,7 +412,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].title"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -425,7 +425,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.*.price"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; @@ -440,7 +440,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) // spark: fiction cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"fiction"}; @@ -457,7 +457,7 @@ TEST_F(JsonPathTests, GetJsonObjectNullInputs) cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); std::string json_path("$.a"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); @@ -473,7 +473,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyQuery) { cudf::test::strings_column_wrapper input{R"({"a" : "b"})"}; std::string json_path(""); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -487,7 +487,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{""}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -500,7 +500,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" 
} })"}; std::string json_path("$.store.bicycle"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {1}); @@ -512,7 +512,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInput) { cudf::test::strings_column_wrapper input{}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, input); } @@ -525,7 +525,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$$"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -535,7 +535,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[auh46h-]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -545,7 +545,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[[]]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -555,7 +555,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[-1]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -565,7 +565,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("."); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -574,7 +574,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("]["); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -583,7 +583,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("6hw6,56i3"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -596,7 +596,7 @@ TEST_F(JsonPathTests, 
GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -607,7 +607,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c[2]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -618,7 +618,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.price"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -629,7 +629,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[4]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -672,7 +672,7 @@ TEST_F(JsonPathTests, MixedOutput) cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); { std::string json_path("$.a"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -694,7 +694,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[1]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -713,7 +713,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -731,7 +731,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -752,7 +752,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -779,13 +779,12 @@ TEST_F(JsonPathTests, StripQuotes) std::string str("{\"a\" : \"b\"}"); cudf::test::strings_column_wrapper input({str, str}); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; 
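Besides the options rename, every call site in this file now goes through the top-level namespace: cudf::get_json_object and cudf::get_json_object_options replace their cudf::strings:: counterparts (the test file itself also moved from cpp/tests/strings/ to cpp/tests/json/). A small sketch (not part of the patch) of the relocated call shape, with `input` standing in for any strings column from these tests:

  // Same JSONPath query as before, issued through the relocated API.
  cudf::get_json_object_options opts;
  opts.set_allow_single_quotes(true);

  std::string const json_path("$.store.bicycle");
  auto const result =
    cudf::get_json_object(cudf::strings_column_view(input), json_path, opts);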
options.set_strip_quotes_from_single_strings(false); std::string json_path("$.a"); - auto result_raw = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); - auto result = drop_whitespace(*result_raw); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"\"b\"", "\"b\""}); auto expected = drop_whitespace(expected_raw); @@ -798,11 +797,10 @@ TEST_F(JsonPathTests, StripQuotes) cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_strip_quotes_from_single_strings(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); cudf::test::strings_column_wrapper expected({""}); @@ -859,11 +857,10 @@ TEST_F(JsonPathTests, AllowSingleQuotes) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -903,11 +900,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.item"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -929,11 +925,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -962,11 +957,10 @@ TEST_F(JsonPathTests, EscapeSequences) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -998,12 +992,12 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) auto const& missing_fields_output, bool default_valid = true) { cudf::test::strings_column_wrapper input{input_string}; - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; // Test default behavior options.set_missing_fields_as_nulls(false); auto const default_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + 
cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper default_expected({default_output}, {default_valid}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(default_expected, *default_result); @@ -1011,7 +1005,7 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) // Test with missing fields as null options.set_missing_fields_as_nulls(true); auto const missing_fields_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper missing_fields_expected({missing_fields_output}, {1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(missing_fields_expected, *missing_fields_result); diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 3a61c0768a6..3558e5676dd 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -27,7 +27,9 @@ #include #include #include +#include #include +#include #include #include @@ -874,6 +876,117 @@ TEST_F(MergeTest, StructsNestedWithNulls) // clang-format on } +using lcw = cudf::test::lists_column_wrapper; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +TEST_F(MergeTest, Lists) +{ + auto col1 = lcw{lcw{1}, lcw{3}, lcw{5}, lcw{7}}; + auto col2 = lcw{lcw{2}, lcw{4}, lcw{6}, lcw{8}}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}); + + auto expected_col = lcw{lcw{1}, lcw{2}, lcw{3}, lcw{4}, lcw{5}, lcw{6}, lcw{7}, lcw{8}}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsWithNulls) +{ + auto col1 = lcw{{lcw{lcw{1}}, lcw{lcw{3}}, lcw{lcw{5}}, lcw{lcw{7}}}, null_at(3)}; + auto col2 = lcw{{lcw{lcw{2}}, lcw{lcw{4}}, lcw{lcw{6}}, lcw{lcw{8}}}, null_at(3)}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + auto expected_col = lcw{{lcw{lcw{1}}, + lcw{lcw{2}}, + lcw{lcw{3}}, + lcw{lcw{4}}, + lcw{lcw{5}}, + lcw{lcw{6}}, + lcw{lcw{7}}, + lcw{lcw{8}}}, + nulls_at({6, 7})}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsofStructs) +{ + // [ {1}, {2}, {3} ] + // [ {5} ] + // [ {7}, {8} ] + // [ {10} ] + auto const col1 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{1, 2, 3, 5, 7, 8, 10}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 6, 7}.release(), + get_structs().release(), + 0, + {}); + }(); + + // [ {4} ] + // [ {6} ] + // [ {9} ] + // [ {11} ] + auto const col2 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{4, 6, 9, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 1, 2, 3, 4}.release(), + get_structs().release(), + 0, + {}); + }(); + + auto tbl1 = cudf::table_view{{*col1}}; + auto tbl2 = cudf::table_view{{*col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + // [ {1}, {2}, {3} ] + // [ {4} ] + // [ 
{5} ] + // [ {6} ] + // [ {7}, {8} ] + // [ {9} ] + // [ {10} ] + // [ {11} ] + auto const expected_col = [] { + auto const get_structs = [] { + auto child0 = + cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 8, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 5, 6, 8, 9, 10, 11}.release(), + get_structs().release(), + 0, + {}); + }(); + auto expected_tbl = cudf::table_view{{*expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp new file mode 100644 index 00000000000..2520aed0458 --- /dev/null +++ b/cpp/tests/streams/binaryop_test.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class BinaryopTest : public cudf::test::BaseFixture {}; + +TEST_F(BinaryopTest, ColumnColumn) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ColumnScalar) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::numeric_scalar rhs{23, true, cudf::test::get_default_stream()}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ScalarColumn) +{ + cudf::numeric_scalar lhs{42, true, cudf::test::get_default_stream()}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +class BinaryopPTXTest : public BinaryopTest { + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } +}; + +TEST_F(BinaryopPTXTest, ColumnColumnPTX) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + // c = a*a*a + b*b + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Eix +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eix( + .param .b64 
_ZN8__main__7add$241Eix_param_0, + .param .b32 _ZN8__main__7add$241Eix_param_1, + .param .b64 _ZN8__main__7add$241Eix_param_2 +) +{ + .reg .b32 %r<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Eix_param_0]; + ld.param.u32 %r1, [_ZN8__main__7add$241Eix_param_1]; + ld.param.u64 %rd2, [_ZN8__main__7add$241Eix_param_2]; + cvt.s64.s32 %rd3, %r1; + mul.wide.s32 %rd4, %r1, %r1; + mul.lo.s64 %rd5, %rd4, %rd3; + mul.lo.s64 %rd6, %rd2, %rd2; + add.s64 %rd7, %rd6, %rd5; + st.u64 [%rd1], %rd7; + mov.u32 %r2, 0; + st.param.b32 [func_retval0+0], %r2; + ret; +} + +)***"; + + cudf::binary_operation( + lhs, rhs, ptx, cudf::data_type(cudf::type_to_id()), cudf::test::get_default_stream()); + cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id())); +} diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp new file mode 100644 index 00000000000..88514fa412c --- /dev/null +++ b/cpp/tests/streams/io/csv_test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class CSVTest : public cudf::test::BaseFixture {}; + +TEST_F(CSVTest, CSVWriter) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} + +TEST_F(CSVTest, CSVReader) +{ + constexpr auto num_rows = 10; + + std::vector 
zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); + + // read the file back on the same stream; minimal reader options with default parsing assumed + auto r_options = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::read_csv(r_options.build(), cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp new file mode 100644 index 00000000000..80619d4d58c --- /dev/null +++ b/cpp/tests/streams/io/json_test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +class JSONTest : public cudf::test::BaseFixture {}; + +TEST_F(JSONTest, JSONreader) +{ + std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(std::vector{cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::FLOAT64}}) + .lines(true) + .legacy(true); + cudf::io::table_with_metadata result = + cudf::io::read_json(in_options, cudf::test::get_default_stream()); +} + +TEST_F(JSONTest, JSONwriter) +{ + cudf::test::strings_column_wrapper col1{"a", "b", "c"}; + cudf::test::strings_column_wrapper col2{"d", "e", "f"}; + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::iterators::nulls_at({0, 2})}; + cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..74e0e8837f7 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExtractListElementFromIndex) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::extract_list_element(list_col, -1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExtractListElementFromIndices) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::test::fixed_width_column_wrapper indices({-1, -2, -1}); + cudf::lists::extract_list_element(list_col, indices, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SegmentedGather) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper gather_map_list{{0}, {1, 2}, {1}}; + cudf::lists::segmented_gather(list_col, + gather_map_list, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Sequences) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 4}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SequencesWithSteps) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 
4}); + cudf::test::fixed_width_column_wrapper steps({2, 1, 1, 1, -3}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, steps, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Reverse) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::reverse(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SortLists) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, StableSortLists) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::stable_sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ApplyBooleanMask) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::apply_boolean_mask(list_col, boolean_mask, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Distinct) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::distinct(list_col, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, DifferenceDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::difference_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IntersectDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::intersect_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, UnionDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::union_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, HaveOverlap) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::have_overlap(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/combine_test.cpp b/cpp/tests/streams/strings/combine_test.cpp new file mode 100644 index 00000000000..9562634957a --- /dev/null +++ b/cpp/tests/streams/strings/combine_test.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class StringsCombineTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsCombineTest, Concatenate) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::table_view({input, input}); + + auto separators = cudf::test::strings_column_wrapper({"_", ".", " "}); + auto separators_view = cudf::strings_column_view(separators); + auto sep_on_null = cudf::strings::separator_on_nulls::YES; + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::concatenate(view, separator, narep, sep_on_null, cudf::test::get_default_stream()); + cudf::strings::concatenate( + view, separators_view, narep, narep, sep_on_null, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, Join) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::strings_column_view(input); + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::join_strings(view, separator, narep, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, JoinLists) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + auto const input = STR_LISTS{ + STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{"ddd", "efgh", "ijk"}, STR_LISTS{"zzz", "xxxxx"}}; + auto view = cudf::lists_column_view(input); + + auto separators = cudf::test::strings_column_wrapper({"_", ".", " "}); + auto separators_view = cudf::strings_column_view(separators); + auto sep_on_null = cudf::strings::separator_on_nulls::YES; + auto if_empty = cudf::strings::output_if_empty_list::EMPTY_STRING; + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::join_list_elements( + view, separator, narep, sep_on_null, if_empty, cudf::test::get_default_stream()); + cudf::strings::join_list_elements( + view, separators_view, narep, narep, sep_on_null, if_empty, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, Repeat) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::strings_column_view(input); + cudf::strings::repeat_strings(view, 0, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, 1, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, 10, cudf::test::get_default_stream()); + + auto counts = cudf::test::fixed_width_column_wrapper({9, 8, 7}); + cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream()); + + auto const str = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 0, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 1, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 10, cudf::test::get_default_stream()); + + auto const invalid = cudf::string_scalar("", false, cudf::test::get_default_stream()); + cudf::strings::repeat_string(invalid, 10, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/contains_test.cpp b/cpp/tests/streams/strings/contains_test.cpp new file 
mode 100644 index 00000000000..383d48abe1e --- /dev/null +++ b/cpp/tests/streams/strings/contains_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsContainsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsContainsTest, Contains) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::contains_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::matches_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::count_re(view, *prog, cudf::test::get_default_stream()); +} + +TEST_F(StringsContainsTest, Like) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream()); + auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream()); + cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream()); + + auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""}); + cudf::strings::like( + view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/convert_test.cpp b/cpp/tests/streams/strings/convert_test.cpp new file mode 100644 index 00000000000..8dc3f625746 --- /dev/null +++ b/cpp/tests/streams/strings/convert_test.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +class StringsConvertTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsConvertTest, Booleans) +{ + auto input = cudf::test::strings_column_wrapper({"true", "false", "True", ""}); + auto view = cudf::strings_column_view(input); + + auto true_scalar = cudf::string_scalar("true", true, cudf::test::get_default_stream()); + auto false_scalar = cudf::string_scalar("false", true, cudf::test::get_default_stream()); + + auto bools = cudf::strings::to_booleans(view, true_scalar, cudf::test::get_default_stream()); + cudf::strings::from_booleans( + bools->view(), true_scalar, false_scalar, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Timestamps) +{ + auto input = cudf::test::strings_column_wrapper({"2019-03-20T12:34:56Z", "2020-02-29T00:00:00Z"}); + auto view = cudf::strings_column_view(input); + + std::string format = "%Y-%m-%dT%H:%M:%SZ"; + auto dtype = cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}; + + cudf::strings::is_timestamp(view, format, cudf::test::get_default_stream()); + auto timestamps = + cudf::strings::to_timestamps(view, dtype, format, cudf::test::get_default_stream()); + + auto empty = cudf::test::strings_column_wrapper(); + cudf::strings::from_timestamps( + timestamps->view(), format, cudf::strings_column_view(empty), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Durations) +{ + auto input = cudf::test::strings_column_wrapper({"17975 days 12:34:56", "18321 days 00:00:00"}); + auto view = cudf::strings_column_view(input); + + std::string format = "%D days %H:%M:%S"; + auto dtype = cudf::data_type{cudf::type_id::DURATION_SECONDS}; + + auto durations = + cudf::strings::to_durations(view, dtype, format, cudf::test::get_default_stream()); + cudf::strings::from_durations(durations->view(), format, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, FixedPoint) +{ + auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-3}}; + + auto values = cudf::strings::to_fixed_point(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_fixed_point(values->view(), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Floats) +{ + auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::FLOAT32}; + + auto values = cudf::strings::to_floats(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_floats(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_float(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Integers) +{ + auto input = cudf::test::strings_column_wrapper({"1234", "-876", "5432"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::INT32}; + + auto values = cudf::strings::to_integers(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_integers(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_integer(view, cudf::test::get_default_stream()); + cudf::strings::is_hex(view, cudf::test::get_default_stream()); + cudf::strings::hex_to_integers(view, dtype, cudf::test::get_default_stream()); + cudf::strings::integers_to_hex(values->view(), 
cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, IPv4) +{ + auto input = cudf::test::strings_column_wrapper({"192.168.0.1", "10.0.0.1"}); + auto view = cudf::strings_column_view(input); + + auto values = cudf::strings::ipv4_to_integers(view, cudf::test::get_default_stream()); + cudf::strings::integers_to_ipv4(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_ipv4(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, URLs) +{ + auto input = cudf::test::strings_column_wrapper({"www.nvidia.com/rapids?p=é", "/_file-7.txt"}); + auto view = cudf::strings_column_view(input); + + auto values = cudf::strings::url_encode(view, cudf::test::get_default_stream()); + cudf::strings::url_decode(values->view(), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, ListsFormat) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + auto const input = + STR_LISTS{{STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{}, STR_LISTS{"ddd", "ee", "f"}}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + auto view = cudf::lists_column_view(input); + auto null_scalar = cudf::string_scalar("NULL", true, cudf::test::get_default_stream()); + auto separators = cudf::strings_column_view(cudf::test::strings_column_wrapper()); + cudf::strings::format_list_column( + view, null_scalar, separators, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/extract_test.cpp b/cpp/tests/streams/strings/extract_test.cpp new file mode 100644 index 00000000000..06570fc5b38 --- /dev/null +++ b/cpp/tests/streams/strings/extract_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsExtractTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsExtractTest, Extract) +{ + auto input = cudf::test::strings_column_wrapper({"Joe Schmoe", "John Smith", "Jane Smith"}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("([A-Z][a-z]+)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::extract(view, *prog, cudf::test::get_default_stream()); + cudf::strings::extract_all_record(view, *prog, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/filter_test.cpp b/cpp/tests/streams/strings/filter_test.cpp new file mode 100644 index 00000000000..3c44eb81380 --- /dev/null +++ b/cpp/tests/streams/strings/filter_test.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +class StringsFilterTest : public cudf::test::BaseFixture {}; + +static std::pair make_entry(char const* from, char const* to) +{ + cudf::char_utf8 in = 0; + cudf::char_utf8 out = 0; + cudf::strings::detail::to_char_utf8(from, in); + if (to) cudf::strings::detail::to_char_utf8(to, out); + return std::pair(in, out); +} + +TEST_F(StringsFilterTest, Translate) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> translate_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + cudf::strings::translate(view, translate_table, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, Filter) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> filter_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const keep = cudf::strings::filter_type::KEEP; + cudf::strings::filter_characters( + view, filter_table, keep, repl, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, FilterTypes) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const verify_types = + cudf::strings::string_character_types::LOWER | cudf::strings::string_character_types::UPPER; + auto const all_types = cudf::strings::string_character_types::ALL_TYPES; + cudf::strings::all_characters_of_type( + view, verify_types, all_types, cudf::test::get_default_stream()); + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const space_types = cudf::strings::string_character_types::SPACE; + cudf::strings::filter_characters_of_type( + view, all_types, repl, space_types, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp new file mode 100644 index 00000000000..fc87460b706 --- /dev/null +++ b/cpp/tests/streams/strings/replace_test.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include + +class StringsReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReplaceTest, Replace) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream()); + cudf::strings::replace(view, view, view, cudf::test::get_default_stream()); + cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegex) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegexBackref) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings"}); + auto view = cudf::strings_column_view(input); + + auto const repl_template = std::string("\\2-\\1"); + auto const pattern = std::string("(\\w) (\\w)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_with_backrefs( + view, *prog, repl_template, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp new file mode 100644 index 00000000000..83dcf24594e --- /dev/null +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +class StringsReverseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReverseTest, Reverse) +{ + auto input = cudf::test::strings_column_wrapper({"aBcdef", " ", "12345"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::reverse(view, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp new file mode 100644 index 00000000000..24247f6f79c --- /dev/null +++ b/cpp/tests/streams/strings/split_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +class StringsSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsSplitTest, SplitPartition) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::partition(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream()); + + auto const pattern = std::string("\\s"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsTest, Strip) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::strip(view, side, strip, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Pad) +{ + auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"}); + auto view = cudf::strings_column_view(input); + + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream()); + cudf::strings::zfill(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Wrap) +{ + auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::wrap(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Slice) +{ + auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"}); + auto view = cudf::strings_column_view(input); + + auto start = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + auto stop = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::strings::slice_strings(view, start, stop, step, cudf::test::get_default_stream()); + + auto starts = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto stops = cudf::test::fixed_width_column_wrapper({4, 5, 6}); + cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/replace_test.cpp b/cpp/tests/streams/text/replace_test.cpp new file mode 100644 index 00000000000..7617f886f9d --- /dev/null +++ b/cpp/tests/streams/text/replace_test.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +class TextReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(TextReplaceTest, Replace) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped over the dog"}); + auto const targets = cudf::test::strings_column_wrapper({"the", "dog"}); + auto const repls = cudf::test::strings_column_wrapper({"_", ""}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::replace_tokens(cudf::strings_column_view(input), + cudf::strings_column_view(targets), + cudf::strings_column_view(repls), + delimiter, + cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, Filter) +{ + auto const input = cudf::test::strings_column_wrapper({"one two three", "four five six"}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const repl = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::filter_tokens( + cudf::strings_column_view(input), 1, delimiter, repl, cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, NormalizeSpaces) +{ + auto input = + cudf::test::strings_column_wrapper({"the\tquick brown\nfox", "jumped\rover the lazy\r\t\n"}); + nvtext::normalize_spaces(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, NormalizeCharacters) +{ + auto input = cudf::test::strings_column_wrapper({"abc£def", "éè â îô\taeio", "\tĂĆĖÑ Ü"}); + nvtext::normalize_characters( + cudf::strings_column_view(input), false, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/tokenize_test.cpp b/cpp/tests/streams/text/tokenize_test.cpp new file mode 100644 index 00000000000..b281fbc2c0c --- /dev/null +++ b/cpp/tests/streams/text/tokenize_test.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +class TextTokenizeTest : public cudf::test::BaseFixture {}; + +TEST_F(TextTokenizeTest, Tokenize) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"}); + auto const view = cudf::strings_column_view(input); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::tokenize(view, delimiter, cudf::test::get_default_stream()); + nvtext::count_tokens(view, delimiter, cudf::test::get_default_stream()); + auto const delimiters = cudf::test::strings_column_wrapper({" ", "o", "é"}); + nvtext::tokenize(view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); + nvtext::count_tokens( + view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, CharacterTokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::character_tokenize(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, Detokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const view = cudf::strings_column_view(input); + auto const indices = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1}); + auto const separator = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::detokenize(view, indices, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/unary_test.cpp b/cpp/tests/streams/unary_test.cpp new file mode 100644 index 00000000000..1734c0c4e9f --- /dev/null +++ b/cpp/tests/streams/unary_test.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +class UnaryTest : public cudf::test::BaseFixture {}; + +TEST_F(UnaryTest, UnaryOperation) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::unary_operation(column, cudf::unary_operator::ABS, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNull) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_null(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsValid) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_valid(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, Cast) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::cast(column, cudf::data_type{cudf::type_id::INT64}, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_nan(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNotNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_not_nan(column, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/booleans_tests.cpp b/cpp/tests/strings/booleans_tests.cpp index 0c7fc992065..469ca77a4c5 100644 --- a/cpp/tests/strings/booleans_tests.cpp +++ b/cpp/tests/strings/booleans_tests.cpp @@ -36,7 +36,8 @@ TEST_F(StringsConvertTest, ToBooleans) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_booleans(strings_view); + auto true_scalar = cudf::string_scalar("true"); + auto results = cudf::strings::to_booleans(strings_view, true_scalar); std::vector h_expected{false, false, false, true, false, false}; cudf::test::fixed_width_column_wrapper expected( @@ -60,26 +61,46 @@ TEST_F(StringsConvertTest, FromBooleans) h_column.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto results = cudf::strings::from_booleans(column); + auto true_scalar = cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + auto results = cudf::strings::from_booleans(column, true_scalar, false_scalar); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(StringsConvertTest, ZeroSizeStringsColumnBoolean) { auto const zero_size_column = cudf::make_empty_column(cudf::type_id::BOOL8)->view(); - auto results = cudf::strings::from_booleans(zero_size_column); + auto true_scalar = cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + auto results = cudf::strings::from_booleans(zero_size_column, true_scalar, false_scalar); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsConvertTest, ZeroSizeBooleansColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = cudf::strings::to_booleans(zero_size_strings_column); + auto true_scalar = cudf::string_scalar("true"); + auto results = cudf::strings::to_booleans(zero_size_strings_column, true_scalar); EXPECT_EQ(0, results->size()); } TEST_F(StringsConvertTest, BooleanError) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, 100); - EXPECT_THROW(cudf::strings::from_booleans(column->view()), cudf::logic_error); + auto int_column = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto true_scalar = 
cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, false_scalar), + cudf::logic_error); + + auto bool_column = cudf::test::fixed_width_column_wrapper({1, 0, 1}); + auto null_scalar = cudf::string_scalar("", false); + EXPECT_THROW(cudf::strings::from_booleans(bool_column, null_scalar, false_scalar), + cudf::logic_error); + EXPECT_THROW(cudf::strings::from_booleans(bool_column, true_scalar, null_scalar), + cudf::logic_error); + auto empty_scalar = cudf::string_scalar("", true); + EXPECT_THROW(cudf::strings::from_booleans(int_column, empty_scalar, false_scalar), + cudf::logic_error); + EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, empty_scalar), + cudf::logic_error); } diff --git a/cpp/tests/strings/format_lists_tests.cpp b/cpp/tests/strings/format_lists_tests.cpp index 95dc9725afc..6196b8ed6ad 100644 --- a/cpp/tests/strings/format_lists_tests.cpp +++ b/cpp/tests/strings/format_lists_tests.cpp @@ -60,8 +60,9 @@ TEST_F(StringsFormatListsTest, WithNulls) cudf::test::iterators::null_at(1)}; auto const view = cudf::lists_column_view(input); - auto results = cudf::strings::format_list_column(view); - auto expected = cudf::test::strings_column_wrapper( + auto null_scalar = cudf::string_scalar("NULL"); + auto results = cudf::strings::format_list_column(view, null_scalar); + auto expected = cudf::test::strings_column_wrapper( {"[a,NULL,ccc]", "NULL", "[NULL,bb,ddd]", "[zzz,xxxxx]", "[v,,NULL,w]"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -132,11 +133,13 @@ TEST_F(StringsFormatListsTest, SlicedLists) "[ééé,12345abcdef]", "[www,12345]"}); + auto null_scalar = cudf::string_scalar("NULL"); + // set of slice intervals: covers slicing the front, back, and middle std::vector> index_pairs({{0, 11}, {0, 4}, {3, 8}, {5, 11}}); for (auto indexes : index_pairs) { auto sliced = cudf::lists_column_view(cudf::slice(input, {indexes.first, indexes.second})[0]); - auto results = cudf::strings::format_list_column(sliced); + auto results = cudf::strings::format_list_column(sliced, null_scalar); auto expected = cudf::test::strings_column_wrapper(h_expected.begin() + indexes.first, h_expected.begin() + indexes.second); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 59805f9cb6d..c8f292f55b2 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -456,3 +456,29 @@ TEST_F(StringsConvertTest, IntegerToHexWithNull) auto results = cudf::strings::integers_to_hex(integers); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + +TEST_F(StringsConvertTest, IntegerConvertErrors) +{ + cudf::test::fixed_width_column_wrapper bools( + {true, true, false, false, true, true, false, true}); + cudf::test::fixed_width_column_wrapper floats( + {123456.0, -1.0, 0.0, 0.0, 12.0, 12345.0, 123456789.0}); + EXPECT_THROW(cudf::strings::integers_to_hex(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::integers_to_hex(floats), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(floats), cudf::logic_error); + + auto input = cudf::test::strings_column_wrapper({"123456", "-1", "0"}); + auto view = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::BOOL8)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, 
cudf::data_type(cudf::type_id::FLOAT32)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::TIMESTAMP_SECONDS)), + cudf::logic_error); + EXPECT_THROW( + cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DURATION_MILLISECONDS)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DECIMAL32)), + cudf::logic_error); +} diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index f143983aded..f04bb832f09 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget) +{ + auto input = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"}); + auto strings_view = cudf::strings_column_view(input); + // replace all occurrences of 'é' with 'e' + cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"}); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + auto target = cudf::string_scalar("é", true, stream); + auto repl = cudf::string_scalar("e", true, stream); + auto results = cudf::strings::replace(strings_view, target, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTest, ReplaceSlice) { std::vector h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""}; diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 234d8c4fecc..a13b61e0ba4 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -24,44 +24,41 @@ #include #include -struct TextBPETokenize : public cudf::test::BaseFixture {}; +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; -TEST_F(TextBPETokenize, BytePairEncoding) +TEST_F(TextBytePairEncoding, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ - "e n", // 12 - "i t", // 14 - "i s", // 15 - "e s", // 18 - "en t", // 42 - "c e", // 88 - "es t", // 139 - "en ce", // 338 - "T h", // 561 - "Th is", // 956 - "t est", // 9032 - "s ent", // 33830 + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 }); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({" This\tis it\n", - "This is test-sentence-1", - "This is test sentence-2", - "This-is test sentence 3", - "", - ""}, - validity); + cudf::test::strings_column_wrapper input( + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, + validity); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs); - - auto expected = cudf::test::strings_column_wrapper({" This is it", - "This is test - sent ence - 1", - "This is test sent ence - 2", - "This - is test sent ence 3", + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"this is it", + "this is test - sent ence - 1", + "this is test sent ence - 2", + "this - is test sent ence 3", "", ""}, validity); @@ -70,41 +67,68 @@ TEST_F(TextBPETokenize, BytePairEncoding) auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); - results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); + sv = cudf::strings_column_view(sliced); + results = nvtext::byte_pair_encoding(sv, *merge_pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } -TEST_F(TextBPETokenize, BytePairEncodingSeparator) +TEST_F(TextBytePairEncoding, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( - {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); cudf::test::strings_column_wrapper input( - {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "}); + {"Ġthe test sentence", "test Ġthe sentence", "Ġthetest sentence", "testĠthesentence"}); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ")); + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string("$")); + + auto expected = cudf::test::strings_column_wrapper({"Ġthe$ $test$ $sent$ence", + "test$ $Ġthe$ $sent$ence", + "Ġthe$test$ $sent$ence", + "test$Ġthe$sent$ence"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(TextBytePairEncoding, BPEAdjacentPairs) +{ 
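+  // Merge table is built from runs of "mm" and "?!" so that applying one merge creates a new adjacent pair that must also be considered by the encoder.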
+ auto mpt = cudf::test::strings_column_wrapper({ + "▁ H", // 157 + "m m", // 10742 + "? !", // 50675 + "▁H mm", // 174381 + "mm m", // 262776 + "?! !", // 352313 + "? !?", // 352314 + "mm mm", // 387733 + "▁H m", // 471269 + "?! ?!", // 506981 + "?!? !", // 506982 + }); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + + cudf::test::strings_column_wrapper input({"▁Hmmmmm", "?!?!?!"}); - auto expected = cudf::test::strings_column_wrapper( - {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"}); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"▁Hmm mmm", "?!?! ?!"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } -TEST_F(TextBPETokenize, BPE_Empty) +TEST_F(TextBytePairEncoding, BPE_Empty) { - auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); - nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; - auto empty = cudf::make_empty_column(cudf::type_id::STRING); - auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs); + auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + auto empty = cudf::make_empty_column(cudf::type_id::STRING); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), *merge_pairs); EXPECT_EQ(0, results->size()); } -TEST_F(TextBPETokenize, BPE_Error) +TEST_F(TextBytePairEncoding, BPE_Error) { auto empty = cudf::make_empty_column(cudf::type_id::STRING); - nvtext::bpe_merge_pairs merge_pairs{std::move(empty)}; - cudf::test::strings_column_wrapper input({"isit"}); - EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs), - cudf::logic_error); + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error); + auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0}); + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error); } diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index d78f2dfbdf3..ea36e13de6f 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -208,14 +208,16 @@ TEST_F(TextTokenizeTest, Vocabulary) {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); - auto validity = cudf::test::iterators::null_at(1); - cudf::test::strings_column_wrapper input({"the fox jumped over the dog", - "the dog chased the cat", - "the cat chased the mouse", - "the mousé ate cheese", - "", - ""}, - validity); + auto validity = cudf::test::iterators::null_at(5); + auto input = cudf::test::strings_column_wrapper({" the fox jumped over the dog ", + " the dog chased the cat", + "", + "the cat chased the mouse ", + "the mousé ate cheese", + "", + "dog"}, + validity); + auto input_view = cudf::strings_column_view(input); auto delimiter = cudf::string_scalar(" "); auto default_id = -7; // should be the token for the missing 'cat' @@ -225,12 +227,55 @@ TEST_F(TextTokenizeTest, Vocabulary) // clang-format off LCW expected({LCW{ 9, 4, 5, 8, 9, 3}, LCW{ 9, 3, 1, 9,-7}, + LCW{}, LCW{ 9,-7, 1, 9, 6}, LCW{ 9, 7, 0, 2}, - LCW{}, LCW{}}, + LCW{}, LCW{3}}, validity); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = 
cudf::slice(input, {1, 4}).front(); + auto sliced_expected = cudf::slice(expected, {1, 4}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); +} + +TEST_F(TextTokenizeTest, VocabularyLongStrings) +{ + cudf::test::strings_column_wrapper vocabulary( + {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); + + std::vector h_strings( + 4, + "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse " + "jumped over the mousé house with the dog "); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); + auto input_view = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar(" "); + auto default_id = -1; + auto results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = cudf::slice(input, {1, 3}).front(); + auto sliced_expected = cudf::slice(expected, {1, 3}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } TEST_F(TextTokenizeTest, TokenizeErrors) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 620e0bfe8de..f54ea28d9b2 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -14,28 +14,24 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include +#include #include -#include #include #include -#include #include #include -#include -#include -#include -#include -#include - #include #include @@ -928,396 +924,6 @@ std::vector bitmask_to_host(cudf::column_view const& c) } } -namespace { - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - return std::to_string(value); -} - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - std::ostringstream o; - o << std::setprecision(std::numeric_limits::max_digits10) << value; - return o.str(); -} - -static auto duration_suffix(cudf::duration_D) { return " days"; } - -static auto duration_suffix(cudf::duration_s) { return " seconds"; } - -static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } - -static auto duration_suffix(cudf::duration_us) { return " microseconds"; } - -static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } - -std::string get_nested_type_str(cudf::column_view const& view) -{ - if (view.type().id() == cudf::type_id::LIST) { - lists_column_view lcv(view); - return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; - } - - if (view.type().id() == cudf::type_id::STRUCT) { - std::ostringstream out; - - out << cudf::type_to_name(view.type()) + "<"; - std::transform(view.child_begin(), - view.child_end(), - std::ostream_iterator(out, ","), - [&out](auto const col) { return get_nested_type_str(col); }); - out << ">"; - return out.str(); - } - - return cudf::type_to_name(view.type()); -} - -template -std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") -{ - column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); - CUDF_EXPECTS(offsets.type().id() == type_id::INT32, - "Column does not appear to be an offsets column"); - CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); - size_type output_size = c.size() + 1; - - // the first offset value to normalize everything against - size_type first = - cudf::detail::get_value(offsets, c.offset(), cudf::test::get_default_stream()); - rmm::device_uvector shifted_offsets(output_size, cudf::test::get_default_stream()); - - // normalize the offset values for the column offset - size_type const* d_offsets = offsets.head() + c.offset(); - thrust::transform( - rmm::exec_policy(cudf::test::get_default_stream()), - d_offsets, - d_offsets + output_size, - shifted_offsets.begin(), - [first] __device__(int32_t offset) { return static_cast(offset - first); }); - - auto const h_shifted_offsets = - cudf::detail::make_host_vector_sync(shifted_offsets, cudf::test::get_default_stream()); - std::ostringstream buffer; - for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) { - buffer << h_shifted_offsets[idx]; - if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; } - } - return buffer.str(); -} - -struct column_view_printer { - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto h_data = cudf::test::to_host(col); - - out.resize(col.size()); - - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&h_data](auto idx) { - return bit_is_set(h_data.second.data(), idx) - ? 
numeric_to_string_precise(h_data.first[idx]) - : std::string("NULL"); - }); - - } else { - std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { - return numeric_to_string_precise(el); - }); - } - } - - template ()>* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - // For timestamps, convert timestamp column to column of strings, then - // call string version - std::string format = [&]() { - if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%SZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; - } - return std::string{"%Y-%m-%d"}; - }(); - - auto col_as_strings = cudf::strings::from_timestamps(col, format); - if (col_as_strings->size() == 0) { return; } - - this->template operator()(*col_as_strings, out, indent); - } - - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto const h_data = cudf::test::to_host(col); - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - std::back_inserter(out), - [&h_data](auto idx) { - return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) - ? static_cast(h_data.first[idx]) - : std::string("NULL"); - }); - } else { - std::transform(std::cbegin(h_data.first), - std::cend(h_data.first), - std::back_inserter(out), - [col](auto const& fp) { return static_cast(fp); }); - } - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - // - // Implementation for strings, call special to_host variant - // - if (col.is_empty()) return; - auto h_data = cudf::test::to_host(col); - - // explicitly replace some special whitespace characters with their literal equivalents - auto cleaned = [](std::string_view in) { - std::string out(in); - auto replace_char = [](std::string& out, char c, std::string_view repl) { - for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) { - out.replace(pos, 1, repl); - } - }; - replace_char(out, '\a', "\\a"); - replace_char(out, '\b', "\\b"); - replace_char(out, '\f', "\\f"); - replace_char(out, '\r', "\\r"); - replace_char(out, '\t', "\\t"); - replace_char(out, '\n', "\\n"); - replace_char(out, '\v', "\\v"); - return out; - }; - - out.resize(col.size()); - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&](auto idx) { - return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) - ? 
cleaned(h_data.first[idx]) - : std::string("NULL"); - }); - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - cudf::dictionary_column_view dictionary(col); - if (col.is_empty()) return; - std::vector keys = to_strings(dictionary.keys()); - std::vector indices = to_strings({dictionary.indices().type(), - dictionary.size(), - dictionary.indices().head(), - dictionary.null_mask(), - dictionary.null_count(), - dictionary.offset()}); - out.insert(out.end(), keys.begin(), keys.end()); - if (!indices.empty()) { - std::string first = "\x08 : " + indices.front(); // use : as delimiter - out.push_back(first); // between keys and indices - out.insert(out.end(), indices.begin() + 1, indices.end()); - } - } - - // Print the tick counts with the units - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto h_data = cudf::test::to_host(col); - - out.resize(col.size()); - - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&h_data](auto idx) { - return bit_is_set(h_data.second.data(), idx) - ? numeric_to_string_precise(h_data.first[idx].count()) + - duration_suffix(h_data.first[idx]) - : std::string("NULL"); - }); - - } else { - std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { - return numeric_to_string_precise(el.count()) + duration_suffix(el); - }); - } - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - lists_column_view lcv(col); - - // propagate slicing to the child if necessary - column_view child = lcv.get_sliced_child(cudf::test::get_default_stream()); - bool const is_sliced = lcv.offset() > 0 || child.offset() > 0; - - std::string tmp = - get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent + - "Length : " + std::to_string(lcv.size()) + "\n" + indent + - "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" + - (lcv.parent().nullable() - ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" + - detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n" - : "") + - // non-nested types don't typically display their null masks, so do it here for convenience. - (!is_nested(child.type()) && child.nullable() - ? " " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n" - : "") + - (detail::to_string(child, ", ", indent + " ")) + "\n"; - - out.push_back(tmp); - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - structs_column_view view{col}; - - std::ostringstream out_stream; - - out_stream << get_nested_type_str(col) << ":\n" - << indent << "Length : " << view.size() << ":\n"; - if (view.nullable()) { - out_stream << indent << "Null count: " << view.null_count() << "\n" - << detail::to_string(bitmask_to_host(col), col.size(), indent) << "\n"; - } - - auto iter = thrust::make_counting_iterator(0); - std::transform( - iter, - iter + view.num_children(), - std::ostream_iterator(out_stream, "\n"), - [&](size_type index) { - auto child = view.get_sliced_child(index, cudf::test::get_default_stream()); - - // non-nested types don't typically display their null masks, so do it here for convenience. - return (!is_nested(child.type()) && child.nullable() - ? 
" " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n" - : "") + - detail::to_string(child, ", ", indent + " "); - }); - - out.push_back(out_stream.str()); - } -}; - -} // namespace - -namespace detail { - -/** - * @copydoc cudf::test::detail::to_strings - */ -std::vector to_strings(cudf::column_view const& col, std::string const& indent) -{ - std::vector reply; - cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent); - return reply; -} - -/** - * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string) - * - * @param indent Indentation for all output - */ -std::string to_string(cudf::column_view const& col, - std::string const& delimiter, - std::string const& indent) -{ - std::ostringstream buffer; - std::vector h_data = to_strings(col, indent); - - buffer << indent; - std::copy(h_data.begin(), - h_data.end() - (!h_data.empty()), - std::ostream_iterator(buffer, delimiter.c_str())); - if (!h_data.empty()) buffer << h_data.back(); - - return buffer.str(); -} - -/** - * @copydoc cudf::test::detail::to_string(std::vector, size_type, std::string) - * - * @param indent Indentation for all output. See comment in `to_strings` for - * a detailed description. - */ -std::string to_string(std::vector const& null_mask, - size_type null_mask_size, - std::string const& indent) -{ - std::ostringstream buffer; - buffer << indent; - for (int idx = null_mask_size - 1; idx >= 0; idx--) { - buffer << (cudf::bit_is_set(null_mask.data(), idx) ? "1" : "0"); - } - return buffer.str(); -} - -} // namespace detail - -/** - * @copydoc cudf::test::to_strings - */ -std::vector to_strings(cudf::column_view const& col) -{ - return detail::to_strings(col); -} - -/** - * @copydoc cudf::test::to_string(cudf::column_view, std::string) - */ -std::string to_string(cudf::column_view const& col, std::string const& delimiter) -{ - return detail::to_string(col, delimiter); -} - -/** - * @copydoc cudf::test::to_string(std::vector, size_type) - */ -std::string to_string(std::vector const& null_mask, size_type null_mask_size) -{ - return detail::to_string(null_mask, null_mask_size); -} - -/** - * @copydoc cudf::test::print - */ -void print(cudf::column_view const& col, std::ostream& os, std::string const& delimiter) -{ - os << to_string(col, delimiter) << std::endl; -} - /** * @copydoc cudf::test::validate_host_masks */ diff --git a/cpp/tests/utilities/debug_utilities.cu b/cpp/tests/utilities/debug_utilities.cu new file mode 100644 index 00000000000..a8a43ffb4ca --- /dev/null +++ b/cpp/tests/utilities/debug_utilities.cu @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cudf::test { + +// Forward declaration. 
+namespace detail { + +/** + * @brief Formats a column view as a string + * + * @param col The column view + * @param delimiter The delimiter to put between strings + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent = ""); + +/** + * @brief Formats a null mask as a string + * + * @param null_mask The null mask buffer + * @param null_mask_size Size of the null mask (in rows) + * @param indent Indentation for all output + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent = ""); + +/** + * @brief Convert column values to a host vector of strings + * + * Supports indentation of all output. For example, if the displayed output of your column + * would be + * + * @code{.pseudo} + * "1,2,3,4,5" + * @endcode + * and the `indent` parameter was " ", that indentation would be prepended to + * result in the output + * @code{.pseudo} + * " 1,2,3,4,5" + * @endcode + * + * The can be useful for displaying complex types. An example use case would be for + * displaying the nesting of a LIST type column (via recursion). + * + * List>: + * Length : 3 + * Offsets : 0, 2, 5, 6 + * Children : + * List: + * Length : 6 + * Offsets : 0, 2, 4, 7, 8, 9, 11 + * Children : + * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 + * + * @param col The column view + * @param indent Indentation for all output + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); + +} // namespace detail + +namespace { + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + return std::to_string(value); +} + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + std::ostringstream o; + o << std::setprecision(std::numeric_limits::max_digits10) << value; + return o.str(); +} + +static auto duration_suffix(cudf::duration_D) { return " days"; } + +static auto duration_suffix(cudf::duration_s) { return " seconds"; } + +static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } + +static auto duration_suffix(cudf::duration_us) { return " microseconds"; } + +static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } + +std::string get_nested_type_str(cudf::column_view const& view) +{ + if (view.type().id() == cudf::type_id::LIST) { + lists_column_view lcv(view); + return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; + } + + if (view.type().id() == cudf::type_id::STRUCT) { + std::ostringstream out; + + out << cudf::type_to_name(view.type()) + "<"; + std::transform(view.child_begin(), + view.child_end(), + std::ostream_iterator(out, ","), + [&out](auto const col) { return get_nested_type_str(col); }); + out << ">"; + return out.str(); + } + + return cudf::type_to_name(view.type()); +} + +template +std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") +{ + column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); + CUDF_EXPECTS(offsets.type().id() == type_id::INT32, + "Column does not appear to be an offsets column"); + CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); + size_type output_size = c.size() + 1; + + // the first offset value to normalize everything against + size_type first = + cudf::detail::get_value(offsets, c.offset(), cudf::get_default_stream()); + rmm::device_uvector shifted_offsets(output_size, cudf::get_default_stream()); + + // 
normalize the offset values for the column offset + size_type const* d_offsets = offsets.head() + c.offset(); + thrust::transform( + rmm::exec_policy(cudf::get_default_stream()), + d_offsets, + d_offsets + output_size, + shifted_offsets.begin(), + [first] __device__(int32_t offset) { return static_cast(offset - first); }); + + auto const h_shifted_offsets = + cudf::detail::make_host_vector_sync(shifted_offsets, cudf::get_default_stream()); + std::ostringstream buffer; + for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) { + buffer << h_shifted_offsets[idx]; + if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; } + } + return buffer.str(); +} + +struct column_view_printer { + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? numeric_to_string_precise(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el); + }); + } + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + // For timestamps, convert timestamp column to column of strings, then + // call string version + std::string format = [&]() { + if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%SZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; + } + return std::string{"%Y-%m-%d"}; + }(); + + auto col_as_strings = cudf::strings::from_timestamps(col, format); + if (col_as_strings->size() == 0) { return; } + + this->template operator()(*col_as_strings, out, indent); + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto const h_data = cudf::test::to_host(col); + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + std::back_inserter(out), + [&h_data](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? 
static_cast(h_data.first[idx]) + : std::string("NULL"); + }); + } else { + std::transform(std::cbegin(h_data.first), + std::cend(h_data.first), + std::back_inserter(out), + [col](auto const& fp) { return static_cast(fp); }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + // + // Implementation for strings, call special to_host variant + // + if (col.is_empty()) return; + auto h_data = cudf::test::to_host(col); + + // explicitly replace some special whitespace characters with their literal equivalents + auto cleaned = [](std::string_view in) { + std::string out(in); + auto replace_char = [](std::string& out, char c, std::string_view repl) { + for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) { + out.replace(pos, 1, repl); + } + }; + replace_char(out, '\a', "\\a"); + replace_char(out, '\b', "\\b"); + replace_char(out, '\f', "\\f"); + replace_char(out, '\r', "\\r"); + replace_char(out, '\t', "\\t"); + replace_char(out, '\n', "\\n"); + replace_char(out, '\v', "\\v"); + return out; + }; + + out.resize(col.size()); + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? cleaned(h_data.first[idx]) + : std::string("NULL"); + }); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + cudf::dictionary_column_view dictionary(col); + if (col.is_empty()) return; + std::vector keys = to_strings(dictionary.keys()); + std::vector indices = to_strings({dictionary.indices().type(), + dictionary.size(), + dictionary.indices().head(), + dictionary.null_mask(), + dictionary.null_count(), + dictionary.offset()}); + out.insert(out.end(), keys.begin(), keys.end()); + if (!indices.empty()) { + std::string first = "\x08 : " + indices.front(); // use : as delimiter + out.push_back(first); // between keys and indices + out.insert(out.end(), indices.begin() + 1, indices.end()); + } + } + + // Print the tick counts with the units + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? numeric_to_string_precise(h_data.first[idx].count()) + + duration_suffix(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el.count()) + duration_suffix(el); + }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + lists_column_view lcv(col); + + // propagate slicing to the child if necessary + column_view child = lcv.get_sliced_child(cudf::get_default_stream()); + bool const is_sliced = lcv.offset() > 0 || child.offset() > 0; + + std::string tmp = + get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent + + "Length : " + std::to_string(lcv.size()) + "\n" + indent + + "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" + + (lcv.parent().nullable() + ? 
indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" + + detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) + "\n" + : "") + + // non-nested types don't typically display their null masks, so do it here for convenience. + (!is_nested(child.type()) && child.nullable() + ? " " + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + (detail::to_string(child, ", ", indent + " ")) + "\n"; + + out.push_back(tmp); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + structs_column_view view{col}; + + std::ostringstream out_stream; + + out_stream << get_nested_type_str(col) << ":\n" + << indent << "Length : " << view.size() << ":\n"; + if (view.nullable()) { + out_stream << indent << "Null count: " << view.null_count() << "\n" + << detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) << "\n"; + } + + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + view.num_children(), + std::ostream_iterator(out_stream, "\n"), + [&](size_type index) { + auto child = view.get_sliced_child(index, cudf::get_default_stream()); + + // non-nested types don't typically display their null masks, so do it here for convenience. + return (!is_nested(child.type()) && child.nullable() + ? " " + + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + detail::to_string(child, ", ", indent + " "); + }); + + out.push_back(out_stream.str()); + } +}; + +} // namespace + +namespace detail { + +/** + * @copydoc cudf::test::detail::to_strings + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent) +{ + std::vector reply; + cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent); + return reply; +} + +/** + * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string) + * + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent) +{ + std::ostringstream buffer; + std::vector h_data = to_strings(col, indent); + + buffer << indent; + std::copy(h_data.begin(), + h_data.end() - (!h_data.empty()), + std::ostream_iterator(buffer, delimiter.c_str())); + if (!h_data.empty()) buffer << h_data.back(); + + return buffer.str(); +} + +/** + * @copydoc cudf::test::detail::to_string(std::vector, size_type, std::string) + * + * @param indent Indentation for all output. See comment in `to_strings` for + * a detailed description. + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent) +{ + std::ostringstream buffer; + buffer << indent; + for (int idx = null_mask_size - 1; idx >= 0; idx--) { + buffer << (cudf::bit_is_set(null_mask.data(), idx) ? 
"1" : "0"); + } + return buffer.str(); +} + +} // namespace detail + +std::vector to_strings(cudf::column_view const& col) +{ + return detail::to_strings(col); +} + +std::string to_string(cudf::column_view const& col, std::string const& delimiter) +{ + return detail::to_string(col, delimiter); +} + +std::string to_string(std::vector const& null_mask, size_type null_mask_size) +{ + return detail::to_string(null_mask, null_mask_size); +} + +void print(cudf::column_view const& col, std::ostream& os) +{ + os << to_string(col, ",") << std::endl; +} + +} // namespace cudf::test diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp new file mode 100644 index 00000000000..0dae407ad21 --- /dev/null +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template +struct ColumnDebugTestIntegral : public cudf::test::BaseFixture {}; +template +struct ColumnDebugTestFloatingPoint : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(ColumnDebugTestIntegral, cudf::test::IntegralTypes); +TYPED_TEST_SUITE(ColumnDebugTestFloatingPoint, cudf::test::FloatingPointTypes); + +TYPED_TEST(ColumnDebugTestIntegral, PrintColumnNumeric) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col({1, 2, 3, 4, 5}); + auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); + + std::stringstream tmp; + auto string_iter = + thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); }); + + std::copy(string_iter, + string_iter + std_col.size() - 1, + std::ostream_iterator(tmp, delimiter)); + + tmp << std::to_string(std_col.back()); + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); +} + +TYPED_TEST(ColumnDebugTestIntegral, PrintColumnWithInvalids) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}}; + auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); + + std::ostringstream tmp; + tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter + << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter + << std::to_string(std_col[4]); + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); +} + +TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnNumeric) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col( + {10001523.25, 2.0, 3.75, 0.000000034, 5.3}); + + auto expected = std::is_same_v + ? 
"10001523.25,2,3.75,3.4e-08,5.2999999999999998" + : "10001523,2,3.75,3.39999993e-08,5.30000019"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnWithInvalids) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col( + {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1}); + + auto expected = std::is_same_v + ? "10001523.25,NULL,3.75,NULL,5.2999999999999998" + : "10001523,NULL,3.75,NULL,5.30000019"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +struct ColumnDebugStringsTest : public cudf::test::BaseFixture {}; + +TEST_F(ColumnDebugStringsTest, PrintColumnDuration) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col({100, 0, 7, 140000}); + + auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +TEST_F(ColumnDebugStringsTest, StringsToString) +{ + char const* delimiter = ","; + + std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + + std::ostringstream tmp; + tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter + << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter + << h_strings[6]; + + EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str()); +} + +TEST_F(ColumnDebugStringsTest, PrintEscapeStrings) +{ + char const* delimiter = ","; + cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"}); + std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"}; + EXPECT_EQ(cudf::test::to_string(input, delimiter), expected); +} diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 90a7270cb29..07d2bea2b28 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -182,106 +182,6 @@ TEST_F(ColumnUtilitiesStringsTest, StringsToHostAllNulls) EXPECT_TRUE(std::all_of(results.begin(), results.end(), [](auto s) { return s.empty(); })); } -TEST_F(ColumnUtilitiesStringsTest, PrintColumnDuration) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col({100, 0, 7, 140000}); - - auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnNumeric) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col({1, 2, 3, 4, 5}); - auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); - - std::stringstream tmp; - auto string_iter = - thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); }); - - std::copy(string_iter, - string_iter + std_col.size() - 1, - std::ostream_iterator(tmp, delimiter)); - - tmp << std::to_string(std_col.back()); - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); -} - -TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnWithInvalids) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}}; - auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); - - std::ostringstream tmp; - 
tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter - << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter - << std::to_string(std_col[4]); - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); -} - -TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnNumeric) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col( - {10001523.25, 2.0, 3.75, 0.000000034, 5.3}); - - auto expected = std::is_same_v - ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998" - : "10001523,2,3.75,3.39999993e-08,5.30000019"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnWithInvalids) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col( - {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1}); - - auto expected = std::is_same_v - ? "10001523.25,NULL,3.75,NULL,5.2999999999999998" - : "10001523,NULL,3.75,NULL,5.30000019"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TEST_F(ColumnUtilitiesStringsTest, StringsToString) -{ - char const* delimiter = ","; - - std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - std::ostringstream tmp; - tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter - << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter - << h_strings[6]; - - EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str()); -} - -TEST_F(ColumnUtilitiesStringsTest, PrintEscapeStrings) -{ - char const* delimiter = ","; - cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"}); - std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"}; - EXPECT_EQ(cudf::test::to_string(input, delimiter), expected); -} - TYPED_TEST(ColumnUtilitiesTestFixedPoint, NonNullableToHost) { using namespace numeric; diff --git a/dependencies.yaml b/dependencies.yaml index 72aaaa1b3fc..97149a5e2ba 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -9,8 +9,8 @@ files: - build_all - build_cpp - build_wheels - - build_python - build_python_common + - build_python_cudf - cudatoolkit - develop - docs @@ -62,6 +62,7 @@ files: includes: - cudatoolkit - docs + - libarrow_run - py_version py_build_cudf: output: pyproject @@ -70,8 +71,8 @@ files: table: build-system includes: - build_all - - build_python - build_python_common + - build_python_cudf - build_wheels py_run_cudf: output: pyproject @@ -137,8 +138,8 @@ files: extras: table: build-system includes: - - build_wheels - build_python_common + - build_wheels py_run_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -230,8 +231,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - librmm==23.10.* - - libkvikio==23.10.* + - librmm==23.12.* + - libkvikio==23.12.* - output_types: conda packages: - fmt>=9.1.0,<10 @@ -240,27 +241,11 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- - libarrow==12.0.1.* + - libarrow-all==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 + # Align nvcomp version with rapids-cmake + - nvcomp==3.0.4 - spdlog>=1.11.0,<1.12 - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - # Align nvcomp version with rapids-cmake - # TODO: not yet available for aarch64 CUDA 12 - - &nvcomp nvcomp==2.6.1 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - *nvcomp - # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, - # all CUDA/arch combinations should be supported by existing packages. - - matrix: - packages: build_wheels: common: - output_types: pyproject @@ -271,18 +256,20 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 - # Hard pin the patch version used during the build. This must be kept - # in sync with the version pinned in get_arrow.cmake. - - pyarrow==12.0.1.* + - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 - build_python: + - scikit-build>=0.13.1 + - output_types: [conda, requirements, pyproject] + packages: + # Hard pin the patch version used during the build. This must be kept + # in sync with the version pinned in get_arrow.cmake. + - pyarrow==14.0.1.* + build_python_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - scikit-build>=0.13.1 - - rmm==23.10.* + - rmm==23.12.* - output_types: conda packages: - &protobuf protobuf>=4.21,<5 @@ -291,16 +278,18 @@ dependencies: - protoc-wheel libarrow_run: common: - - output_types: [conda, requirements] + - output_types: conda packages: # Allow runtime version to float up to minor version - - libarrow==12.* + # Disallow libarrow 14.0.0 due to a CVE + - libarrow-all>=14.0.1,<15.0.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - - pyarrow==12.* + # Disallow pyarrow 14.0.0 due to a CVE + - pyarrow>=14.0.1,<15.0.0a0 cudatoolkit: specific: - output_types: conda @@ -401,15 +390,15 @@ dependencies: common: - output_types: [conda] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *doxygen - - libarrow==12.0.1.* - make - myst-nb - nbsphinx - numpydoc - pandoc - - pydata-sphinx-theme + # https://github.com/pydata/pydata-sphinx-theme/issues/1539 + - pydata-sphinx-theme!=0.14.2 - scipy - sphinx - sphinx-autobuild @@ -455,7 +444,7 @@ dependencies: - nvtx>=0.2.1 - packaging - rich - - rmm==23.10.* + - rmm==23.12.* - typing_extensions>=4.0.0 - *protobuf - output_types: conda @@ -508,15 +497,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask==2023.9.2 - - distributed==2023.9.2 + - rapids-dask-dependency==23.12.* - output_types: conda packages: - cupy>=12.0.0 - - dask-core==2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - - &cudf cudf==23.10.* + - &cudf cudf==23.12.* - *cupy_pip run_cudf_kafka: common: @@ -535,7 +522,7 @@ dependencies: packages: - confluent-kafka>=1.9.0,<1.10.0a0 - *cudf - - cudf_kafka==23.10.* + - cudf_kafka==23.12.* test_cpp: common: - output_types: conda @@ -580,7 +567,6 @@ dependencies: - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - pyorc - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 @@ -618,7 +604,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *numba test_python_pandas_cudf: common: diff --git 
a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 03b1bb7039b..28e305b71cb 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,9 +79,9 @@ # built documents. # # The short X.Y version. -version = '23.10' +version = '23.12' # The full version, including alpha/beta/rc tags. -release = '23.10.00' +release = '23.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -106,6 +106,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md index 1f4cfbc7366..e6fe3109c57 100644 --- a/docs/cudf/source/user_guide/data-types.md +++ b/docs/cudf/source/user_guide/data-types.md @@ -136,7 +136,7 @@ dtype: struct StructDtype({'a': dtype('int64'), 'b': dtype('int64')}) ``` -Or by reading them from disk, using a [file format that supports nested data](io). +Or by reading them from disk, using a [file format that supports nested data](/user_guide/io/index.md). ```python >>> pdf = pd.DataFrame({"a": [[1, 2], [3, 4, 5], [6, 7, 8]]}) diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index ad629b5e949..00568a57431 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '23.10' -release = '23.10.00' +version = '23.12' +release = '23.12.00' language = "en" @@ -57,6 +57,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 4a68c7dbc60..e79d9d86fce 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/ci/README.md b/java/ci/README.md index e9599b33bf1..12a2bb2dc51 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.12 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-23.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-23.12.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index afcc0e15a2c..cc880312d34 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 23.10.0-SNAPSHOT + 23.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index e1298e29925..7cc3d30a9cf 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import ai.rapids.cudf.NvtxColor; -import ai.rapids.cudf.NvtxRange; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +87,21 @@ private Stream() { this.id = -1; } + private Stream(long id) { + this.cleaner = null; + this.id = id; + } + + /** + * Wrap a given stream ID to make it accessible. + */ + static Stream wrap(long id) { + if (id == -1) { + return DEFAULT_STREAM; + } + return new Stream(id); + } + /** * Have this stream not execute new work until the work recorded in event completes. * @param event the event to wait on. @@ -122,7 +134,9 @@ public synchronized void close() { cleaner.delRef(); } if (closed) { - cleaner.logRefCountDebug("double free " + this); + if (cleaner != null) { + cleaner.logRefCountDebug("double free " + this); + } throw new IllegalStateException("Close called too many times " + this); } if (cleaner != null) { diff --git a/java/src/main/java/ai/rapids/cudf/DataSource.java b/java/src/main/java/ai/rapids/cudf/DataSource.java new file mode 100644 index 00000000000..1e5893235df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSource.java @@ -0,0 +1,189 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; + +/** + * Base class that can be used to provide data dynamically to CUDF. This follows somewhat + * closely with cudf::io::datasource. There are a few main differences. + *
+ * First this does not expose async device reads. It will call the non-async device read API + * instead. This might be added in the future, but there was no direct use case for it in java + * right now to warrant the added complexity. + *
+ * Second there is no implementation of the device read API that returns a buffer instead of + * writing into one. This is not used by CUDF yet so testing an implementation that isn't used + * didn't feel ideal. If it is needed we will add one in the future. + */ +public abstract class DataSource implements AutoCloseable { + private static final Logger log = LoggerFactory.getLogger(DataSource.class); + + /** + * This is used to keep track of the HostMemoryBuffers in java land so the C++ layer + * does not have to do it. + */ + private final HashMap cachedBuffers = new HashMap<>(); + + @Override + public void close() { + if (!cachedBuffers.isEmpty()) { + throw new IllegalStateException("DataSource closed before all returned host buffers were closed"); + } + } + + /** + * Get the size of the source in bytes. + */ + public abstract long size(); + + /** + * Read data from the source at the given offset. Return a HostMemoryBuffer for the data + * that was read. + * @param offset where to start reading from. + * @param amount the maximum number of bytes to read. + * @return a buffer that points to the data. + * @throws IOException on any error. + */ + public abstract HostMemoryBuffer hostRead(long offset, long amount) throws IOException; + + + /** + * Called when the buffer returned from hostRead is done. The default is to close the buffer. + */ + protected void onHostBufferDone(HostMemoryBuffer buffer) { + if (buffer != null) { + buffer.close(); + } + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from in the source. + * @param dest where to write the data. + * @return the actual number of bytes written to dest. + */ + public abstract long hostRead(long offset, HostMemoryBuffer dest) throws IOException; + + /** + * Return true if this supports reading directly to the device else false. The default is + * no device support. This cannot change dynamically. It is typically read just once. + */ + public boolean supportsDeviceRead() { + return false; + } + + /** + * Get the size cutoff between device reads and host reads when device reads are supported. + * Anything larger than the cutoff will be a device read and anything smaller will be a + * host read. By default, the cutoff is 0 so all reads will be device reads if device reads + * are supported. + */ + public long getDeviceReadCutoff() { + return 0; + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from + * @param dest where to write the data. + * @param stream the stream to do the copy on. + * @return the actual number of bytes written to dest. 
+ */ + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) throws IOException { + throw new IllegalStateException("Device read is not implemented"); + } + + ///////////////////////////////////////////////// + // Internal methods called from JNI + ///////////////////////////////////////////////// + + private static class NoopCleaner extends MemoryBuffer.MemoryBufferCleaner { + @Override + protected boolean cleanImpl(boolean logErrorIfNotClean) { + return true; + } + + @Override + public boolean isClean() { + return true; + } + } + private static final NoopCleaner cleaner = new NoopCleaner(); + + // Called from JNI + private void onHostBufferDone(long bufferId) { + HostMemoryBuffer hmb = cachedBuffers.remove(bufferId); + if (hmb != null) { + onHostBufferDone(hmb); + } else { + // Called from C++ destructor so avoid throwing... + log.warn("Got a close callback for a buffer we could not find " + bufferId); + } + } + + // Called from JNI + private long hostRead(long offset, long amount, long dst) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot allocate more than " + Long.MAX_VALUE + " bytes"); + } + try (HostMemoryBuffer dstBuffer = new HostMemoryBuffer(dst, amount, cleaner)) { + return hostRead(offset, dstBuffer); + } + } + + // Called from JNI + private long[] hostReadBuff(long offset, long amount) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + HostMemoryBuffer buff = hostRead(offset, amount); + long[] ret = new long[3]; + if (buff != null) { + long id = buff.id; + if (cachedBuffers.put(id, buff) != null) { + throw new IllegalStateException("Already had a buffer cached for " + buff); + } + ret[0] = buff.address; + ret[1] = buff.length; + ret[2] = id; + } // else they are all 0 because java does that already + return ret; + } + + // Called from JNI + private long deviceRead(long offset, long amount, long dst, long stream) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + Cuda.Stream strm = Cuda.Stream.wrap(stream); + try (DeviceMemoryBuffer dstBuffer = new DeviceMemoryBuffer(dst, amount, cleaner)) { + return deviceRead(offset, dstBuffer, strm); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java new file mode 100644 index 00000000000..5d4dcb8e4e7 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java @@ -0,0 +1,44 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is here because we need some JNI methods to work with a DataSource, but + * we also want to cache callback methods at startup for performance reasons. If + * we put both in the same class we will get a deadlock because of how we load + * the JNI. 
We have a static block that blocks loading the class until the JNI + * library is loaded and the JNI library cannot load until the class is loaded + * and cached. This breaks the loop. + */ +class DataSourceHelper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + static long createWrapperDataSource(DataSource ds) { + return createWrapperDataSource(ds, ds.size(), ds.supportsDeviceRead(), + ds.getDeviceReadCutoff()); + } + + private static native long createWrapperDataSource(DataSource ds, long size, + boolean deviceReadSupport, + long deviceReadCutoff); + + static native void destroyWrapperDataSource(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index c4d9bdb8f91..9eab607ed0b 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,10 @@ public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } + DeviceMemoryBuffer(long address, long lengthInBytes, MemoryBufferCleaner cleaner) { + super(address, lengthInBytes, cleaner); + } + DeviceMemoryBuffer(long address, long lengthInBytes, long rmmBufferAddress) { super(address, lengthInBytes, new RmmDeviceBufferCleaner(rmmBufferAddress)); } diff --git a/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java new file mode 100644 index 00000000000..6986b6a7fec --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is a DataSource that can take multiple HostMemoryBuffers. They + * are treated as if they are all part of a single file connected end to end. + */ +public class MultiBufferDataSource extends DataSource { + private final long sizeInBytes; + private final HostMemoryBuffer[] hostBuffers; + private final long[] startOffsets; + private final HostMemoryAllocator allocator; + + // Metrics + private long hostReads = 0; + private long hostReadBytes = 0; + private long devReads = 0; + private long devReadBytes = 0; + + /** + * Create a new data source backed by multiple buffers. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryBuffer ... buffers) { + this(DefaultHostMemoryAllocator.get(), buffers); + } + + /** + * Create a new data source backed by multiple buffers. + * @param allocator the allocator to use for host buffers, if needed. 
+ * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryAllocator allocator, HostMemoryBuffer ... buffers) { + int numBuffers = buffers.length; + hostBuffers = new HostMemoryBuffer[numBuffers]; + startOffsets = new long[numBuffers]; + + long currentOffset = 0; + for (int i = 0; i < numBuffers; i++) { + HostMemoryBuffer hmb = buffers[i]; + hmb.incRefCount(); + hostBuffers[i] = hmb; + startOffsets[i] = currentOffset; + currentOffset += hmb.getLength(); + } + sizeInBytes = currentOffset; + this.allocator = allocator; + } + + @Override + public long size() { + return sizeInBytes; + } + + private int getStartBufferIndexForOffset(long offset) { + assert (offset >= 0); + + // It is super common to read from the start or end of a file (the header or footer) + // so special case them + if (offset == 0) { + return 0; + } + int startIndex = 0; + int endIndex = startOffsets.length - 1; + if (offset >= startOffsets[endIndex]) { + return endIndex; + } + while (startIndex != endIndex) { + int midIndex = (int)(((long)startIndex + endIndex) / 2); + long midStartOffset = startOffsets[midIndex]; + if (offset >= midStartOffset) { + // It is either in mid or after mid. + if (midIndex == endIndex || offset <= startOffsets[midIndex + 1]) { + // We found it in mid + return midIndex; + } else { + // It is after mid + startIndex = midIndex + 1; + } + } else { + // It is before mid + endIndex = midIndex - 1; + } + } + return startIndex; + } + + + interface DoCopy { + void copyFromHostBuffer(T dest, long destOffset, HostMemoryBuffer src, + long srcOffset, long srcAmount); + } + + private long read(long offset, T dest, DoCopy doCopy) { + assert (offset >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, dest.getLength()); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + long remainingAmount = realAmount; + long currentOffset = realOffset; + long outputOffset = 0; + + while (remainingAmount > 0) { + doCopy.copyFromHostBuffer(dest, outputOffset, buffer, + bufferOffset, bufferAmount); + remainingAmount -= bufferAmount; + outputOffset += bufferAmount; + currentOffset += bufferAmount; + index++; + if (index < hostBuffers.length) { + buffer = hostBuffers[index]; + bufferOffset = currentOffset - startOffsets[index]; + bufferAmount = Math.min(buffer.length - bufferOffset, remainingAmount); + } + } + + return realAmount; + } + + @Override + public HostMemoryBuffer hostRead(long offset, long amount) { + assert (offset >= 0); + assert (amount >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, amount); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + if (bufferAmount == realAmount) { + hostReads += 1; + hostReadBytes += realAmount; + // It all fits in a single buffer, so do a zero copy operation + return buffer.slice(bufferOffset, bufferAmount); + } else { + // We will have to allocate a new buffer and copy data into it. 
+ boolean success = false; + HostMemoryBuffer ret = allocator.allocate(realAmount, true); + try { + long amountRead = read(offset, ret, HostMemoryBuffer::copyFromHostBuffer); + assert(amountRead == realAmount); + hostReads += 1; + hostReadBytes += amountRead; + success = true; + return ret; + } finally { + if (!success) { + ret.close(); + } + } + } + } + + @Override + public long hostRead(long offset, HostMemoryBuffer dest) { + long ret = read(offset, dest, HostMemoryBuffer::copyFromHostBuffer); + hostReads += 1; + hostReadBytes += ret; + return ret; + } + + @Override + public boolean supportsDeviceRead() { + return true; + } + + @Override + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) { + long ret = read(offset, dest, (destParam, destOffset, src, srcOffset, srcAmount) -> + destParam.copyFromHostBufferAsync(destOffset, src, srcOffset, srcAmount, stream)); + devReads += 1; + devReadBytes += ret; + return ret; + } + + + @Override + public void close() { + try { + super.close(); + } finally { + for (HostMemoryBuffer hmb: hostBuffers) { + if (hmb != null) { + hmb.close(); + } + } + } + } + + public long getHostReads() { + return hostReads; + } + + public long getHostReadBytes() { + return hostReadBytes; + } + + public long getDevReads() { + return devReads; + } + + public long getDevReadBytes() { + return devReadBytes; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index c34336ac73f..17d59b757c3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } @@ -71,18 +71,45 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } + /** + * Construct a reader instance from a DataSource + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param opts The options for Parquet reading. 
+ * @param ds the data source to read from + */ + public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, DataSource ds) { + dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + if (dataSourceHandle == 0) { + throw new IllegalStateException("Cannot create native datasource object"); + } + + boolean passed = false; + try { + handle = createWithDataSource(chunkSizeByteLimit, opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle); + passed = true; + } finally { + if (!passed) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } + } + } + /** * Check if the given file has anything left to read. * * @return A boolean value indicating if there is more data to read from file. */ public boolean hasNext() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -104,7 +131,7 @@ public boolean hasNext() { * @return A table of new rows reading from the given file. */ public Table readChunk() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -118,6 +145,10 @@ public void close() { close(handle); handle = 0; } + if (dataSourceHandle != 0) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } } @@ -131,6 +162,7 @@ public void close() { */ private long handle; + private long dataSourceHandle = 0; /** * Create a native chunked Parquet reader object on heap and return its memory address. @@ -147,6 +179,9 @@ public void close() { private static native long create(long chunkSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit); + private static native long createWithDataSource(long chunkedSizeByteLimit, + String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); + private static native boolean hasNext(long handle); private static native long[] readChunk(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 51a33ebb72f..3bd1e3f25a7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -235,6 +235,14 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readCSVFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String[] filterColumnNames, + int headerRow, byte delim, int quoteStyle, byte quote, + byte comment, String[] nullValues, + String[] trueValues, String[] falseValues, + long dataSourceHandle) throws CudfException; + /** * read JSON data and return a pointer to a TableWithMeta object. 
*/ @@ -244,6 +252,12 @@ private static native long readJSON(String[] columnNames, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; + private static native long readJSONFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + boolean dayFirst, boolean lines, + boolean recoverWithNulls, + long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; @@ -260,6 +274,10 @@ private static native long readAndInferJSON(long address, long length, private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; + private static native long[] readParquetFromDataSource(String[] filterColumnNames, + boolean[] binaryToString, int timeUnit, + long dataSourceHandle) throws CudfException; + /** * Read in Avro formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -271,6 +289,9 @@ private static native long[] readParquet(String[] filterColumnNames, boolean[] b private static native long[] readAvro(String[] filterColumnNames, String filePath, long address, long length) throws CudfException; + private static native long[] readAvroFromDataSource(String[] filterColumnNames, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -372,6 +393,11 @@ private static native long[] readORC(String[] filterColumnNames, boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns) throws CudfException; + private static native long[] readORCFromDataSource(String[] filterColumnNames, + boolean usingNumPyTypes, int timeUnit, + String[] decimal128Columns, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns @@ -881,6 +907,27 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + public static Table readCSV(Schema schema, CSVOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readCSVFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), + schema.getTypeScales(), + opts.getIncludeColumnNames(), + opts.getHeaderRow(), + opts.getDelim(), + opts.getQuoteStyle().nativeId, + opts.getQuote(), + opts.getComment(), + opts.getNullValues(), + opts.getTrueValues(), + opts.getFalseValues(), + dsHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + private static native void writeCSVToFile(long table, String[] columnNames, boolean includeHeader, @@ -1128,6 +1175,24 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b } } + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull(), dsHandle))) { + return gatherJSONColumns(schema, twm); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. @@ -1214,6 +1279,17 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + public static Table readParquet(ParquetOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readParquetFromDataSource(opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read an Avro file using the default AvroOptions. * @param path the local file to read. @@ -1297,6 +1373,16 @@ public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len)); } + public static Table readAvro(AvroOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readAvroFromDataSource(opts.getIncludeColumnNames(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. @@ -1388,6 +1474,17 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer, opts.getDecimal128Columns())); } + public static Table readORC(ORCOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readORCFromDataSource(opts.getIncludeColumnNames(), + opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), + opts.getDecimal128Columns(), dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + private static class ParquetTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -2262,7 +2359,7 @@ public Table dropDuplicates(int[] keyColumns, DuplicateKeepOption keep, boolean /** * Count how many rows in the table are distinct from one another. - * @param nullEqual if nulls should be considered equal to each other or not. + * @param nullsEqual if nulls should be considered equal to each other or not. */ public int distinctCount(NullEquality nullsEqual) { return distinctCount(nativeHandle, nullsEqual.nullsEqual); diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index 427dd286b0c..4e1e886c282 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; /** A literal value in an AST expression. 
*/ public final class Literal extends AstExpression { @@ -205,7 +206,14 @@ public static Literal ofString(String value) { if (value == null) { return ofNull(DType.STRING); } - byte[] stringBytes = value.getBytes(); + return ofUTF8String(value.getBytes(StandardCharsets.UTF_8)); + } + + /** Construct a string literal directly with byte array to skip transcoding. */ + public static Literal ofUTF8String(byte[] stringBytes) { + if (stringBytes == null) { + return ofNull(DType.STRING); + } byte[] serializedValue = new byte[stringBytes.length + Integer.BYTES]; ByteBuffer.wrap(serializedValue).order(ByteOrder.nativeOrder()).putInt(stringBytes.length); System.arraycopy(stringBytes, 0, serializedValue, Integer.BYTES, stringBytes.length); diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 128989fe77f..01161a03dd4 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) @@ -135,6 +135,7 @@ add_library( src/ColumnViewJni.cu src/CompiledExpression.cpp src/ContiguousTableJni.cpp + src/DataSourceHelperJni.cpp src/HashJoinJni.cpp src/HostMemoryBufferNativeUtilsJni.cpp src/NvcompJni.cpp diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp index 8d0a8bdbfe7..0044385f267 100644 --- a/java/src/main/native/src/ChunkedReaderJni.cpp +++ b/java/src/main/native/src/ChunkedReaderJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource( + JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names, + jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) { + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, ds_handle, "Null DataSouurce", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + // TODO: This variable is unused now, but we still don't know what to do with it yet. + // As such, it needs to stay here for a little more time before we decide to use it again, + // or remove it completely. 
+ cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); + (void)n_col_binary_read; + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto opts_builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector()); + } + auto const read_opts = opts_builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + + return reinterpret_cast(new cudf::io::chunked_parquet_reader( + static_cast(chunk_read_limit), read_opts)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "handle is null", false); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ddaa2c15b5..7a626daff1f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -62,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -1130,7 +1130,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas } if (n_data_type.id() == cudf::type_id::STRING) { switch (column->type().id()) { - case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::from_booleans(*column)); + case cudf::type_id::BOOL8: { + auto const true_scalar = cudf::string_scalar("true"); + auto const false_scalar = cudf::string_scalar("false"); + return release_as_jlong(cudf::strings::from_booleans(*column, true_scalar, false_scalar)); + } case cudf::type_id::FLOAT32: case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::from_floats(*column)); case cudf::type_id::INT8: @@ -1149,7 +1153,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas } } else if (column->type().id() == cudf::type_id::STRING) { switch (n_data_type.id()) { - case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::to_booleans(*column)); + case cudf::type_id::BOOL8: { + auto const true_scalar = cudf::string_scalar("true"); + return release_as_jlong(cudf::strings::to_booleans(*column, true_scalar)); + } case cudf::type_id::FLOAT32: case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::to_floats(*column, n_data_type)); @@ -2436,7 +2443,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view *n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); - return release_as_jlong(cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path)); + return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path)); } CATCH_STD(env, 0) } diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index 0f143086451..d0a25d449a6 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -175,6 +175,14 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { return JNI_ERR; } + if (!cudf::jni::cache_data_source_jni(env)) { + if (!env->ExceptionCheck()) { + env->ThrowNew(env->FindClass("java/lang/RuntimeException"), + "Unable to locate data source helper methods needed by JNI"); + } + return JNI_ERR; + } + return 
cudf::jni::MINIMUM_JNI_VERSION; } diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp new file mode 100644 index 00000000000..8d0e4d36413 --- /dev/null +++ b/java/src/main/native/src/DataSourceHelperJni.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" +#include "jni_utils.hpp" + +namespace { + +#define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource" + +jclass DataSource_jclass; +jmethodID hostRead_method; +jmethodID hostReadBuff_method; +jmethodID onHostBufferDone_method; +jmethodID deviceRead_method; + +} // anonymous namespace + +namespace cudf { +namespace jni { +bool cache_data_source_jni(JNIEnv *env) { + jclass cls = env->FindClass(DATA_SOURCE_CLASS); + if (cls == nullptr) { + return false; + } + + hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J"); + if (hostRead_method == nullptr) { + return false; + } + + hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J"); + if (hostReadBuff_method == nullptr) { + return false; + } + + onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V"); + if (onHostBufferDone_method == nullptr) { + return false; + } + + deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J"); + if (deviceRead_method == nullptr) { + return false; + } + + // Convert local reference to global so it cannot be garbage collected. + DataSource_jclass = static_cast(env->NewGlobalRef(cls)); + if (DataSource_jclass == nullptr) { + return false; + } + return true; +} + +void release_data_source_jni(JNIEnv *env) { + DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass); +} + +class host_buffer_done_callback { +public: + explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {} + + host_buffer_done_callback(host_buffer_done_callback const &other) = delete; + host_buffer_done_callback(host_buffer_done_callback &&other) + : jvm(other.jvm), ds(other.ds), id(other.id) { + other.jvm = nullptr; + other.ds = nullptr; + other.id = -1; + } + + host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete; + host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete; + + ~host_buffer_done_callback() { + // because we are in a destructor we cannot throw an exception, so for now we are + // just going to keep the java exceptions around and have them be thrown when this + // thread returns to the JVM. It might be kind of confusing, but we will not lose + // them. 
+ if (jvm != nullptr) { + // We cannot throw an exception in the destructor, so this is really best effort + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + env->CallVoidMethod(this->ds, onHostBufferDone_method, id); + } + } + } + +private: + JavaVM *jvm; + jobject ds; + long id; +}; + +class jni_datasource : public cudf::io::datasource { +public: + explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported, + size_t device_read_cutoff) + : ds_size(ds_size), device_read_supported(device_read_supported), + device_read_cutoff(device_read_cutoff) { + if (env->GetJavaVM(&jvm) < 0) { + throw std::runtime_error("GetJavaVM failed"); + } + this->ds = add_global_ref(env, ds); + } + + virtual ~jni_datasource() { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + ds = del_global_ref(env, ds); + } + ds = nullptr; + } + + std::unique_ptr host_read(size_t offset, size_t size) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlongArray jbuffer_info = + static_cast(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + + cudf::jni::native_jlongArray buffer_info(env, jbuffer_info); + auto ptr = reinterpret_cast(buffer_info[0]); + size_t length = buffer_info[1]; + long id = buffer_info[2]; + + cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id); + return std::make_unique>(std::move(cb), ptr, + length); + } + + size_t host_read(size_t offset, size_t size, uint8_t *dst) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast(dst)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + return amount_read; + } + + size_t size() const override { return ds_size; } + + bool supports_device_read() const override { return device_read_supported; } + + bool is_device_read_preferred(size_t size) const override { + return device_read_supported && size >= device_read_cutoff; + } + + size_t device_read(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast(dst), + reinterpret_cast(stream.value())); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in deviceRead"); + } + return amount_read; + } + + std::future device_read_async(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + auto amount_read = device_read(offset, size, dst, stream); + // This is a bit ugly, but we don't have a good way or a need to return + // a future for the read + std::promise ret; + ret.set_value(amount_read); + return ret.get_future(); + } + +private: + size_t ds_size; + bool device_read_supported; + size_t device_read_cutoff; + JavaVM *jvm; + jobject ds; +}; +} // 
namespace jni +} // namespace cudf + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource( + JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported, + jlong device_read_cutoff) { + JNI_NULL_CHECK(env, ds, "Null data source", 0); + try { + cudf::jni::auto_set_device(env); + auto source = + new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff); + return reinterpret_cast(source); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env, + jclass, + jlong handle) { + try { + cudf::jni::auto_set_device(env); + if (handle != 0) { + auto source = reinterpret_cast(handle); + delete (source); + } + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b208ef8f381..fad19bdf895 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1135,6 +1135,67 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote, + jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values, + jlong ds_handle) { + JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL); + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto type, auto scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + cudf::jni::native_jstringArray n_null_values(env, null_values); + cudf::jni::native_jstringArray n_true_values(env, true_values); + cudf::jni::native_jstringArray n_false_values(env, false_values); + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto const quote_style = static_cast(j_quote_style); + + cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source) + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(data_types) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quoting(quote_style) + .quotechar(quote) + .comment(comment) + .build(); + + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); + } + CATCH_STD(env, NULL); +} 
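
The JNI entry points above are the native half of the `DataSource` support added to the Java API earlier in this patch. As a quick orientation, here is a minimal usage sketch that is not part of the patch itself: it stages a Parquet file in host memory and reads it through the new `MultiBufferDataSource` path, mirroring the new tests in `TableTest.java`. The class name, file path argument, and the 240000-byte chunk limit are illustrative placeholders, not names introduced by this change.

```java
import java.nio.file.Files;
import java.nio.file.Paths;

import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.MultiBufferDataSource;
import ai.rapids.cudf.ParquetChunkedReader;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class DataSourceReadSketch {
  public static void main(String[] args) throws Exception {
    // Stage the file in host memory; MultiBufferDataSource treats any set of
    // HostMemoryBuffers as one contiguous stream.
    byte[] bytes = Files.readAllBytes(Paths.get(args[0]));
    try (HostMemoryBuffer buf = HostMemoryBuffer.allocate(bytes.length)) {
      buf.setBytes(0, bytes, 0, bytes.length);

      // Whole-table read through the new readParquet(ParquetOptions, DataSource) overload.
      try (MultiBufferDataSource source = new MultiBufferDataSource(buf);
           Table table = Table.readParquet(ParquetOptions.DEFAULT, source)) {
        System.out.println("rows: " + table.getRowCount());
      }

      // Chunked read through the new ParquetChunkedReader(long, ParquetOptions, DataSource)
      // constructor; 240000 bytes is only an example limit.
      try (MultiBufferDataSource source = new MultiBufferDataSource(buf);
           ParquetChunkedReader reader =
               new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) {
        while (reader.hasNext()) {
          try (Table chunk = reader.readChunk()) {
            System.out.println("chunk rows: " + chunk.getRowCount());
          }
        }
      }
    }
  }
}
```

A fully custom source only needs to extend `DataSource` and implement `size()` plus the two `hostRead` overloads; device reads are optional and are gated by `supportsDeviceRead()` and `getDeviceReadCutoff()`.
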
+ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, @@ -1407,6 +1468,72 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE CATCH_STD(env, nullptr); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + 0); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + 0); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto const &type, auto const &scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); + + if (!n_col_names.is_null() && data_types.size() > 0) { + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "types and column names must match size", 0); + } + + std::map map; + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(), + std::inserter(map, map.end()), + [](std::string a, cudf::data_type b) { return std::make_pair(a, b); }); + opts.dtypes(map); + } else if (data_types.size() > 0) { + opts.dtypes(data_types); + } else { + // should infer the types + } + + auto result = + std::make_unique(cudf::io::read_json(opts.build())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, @@ -1489,6 +1616,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit, + jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + cudf::jni::native_jbooleanArray n_col_binary_read(env, 
j_col_binary_read); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::parquet_reader_options opts = + builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { @@ -1535,10 +1692,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source) + .columns(n_filter_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_avro(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, - jlong buffer_length, jint unit) { + jlong buffer_length) { const bool read_buffer = (buffer != 0); if (!read_buffer) { @@ -1715,6 +1893,38 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc CATCH_STD(env, ) } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit, + jobjectArray dec128_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::orc_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::orc_reader_options opts = + builder.use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .decimal128_columns(n_dec128_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) { diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index 867df80b722..bd82bbd2899 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -134,5 +134,13 @@ void 
auto_set_device(JNIEnv *env); */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value); +// +// DataSource APIs +// + +bool cache_data_source_jni(JNIEnv *env); + +void release_data_source_jni(JNIEnv *env); + } // namespace jni } // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index faa73ac4322..b0dd4122b0e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -327,6 +327,25 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFromDataSource() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_SIMPLE_JSON_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFileWithInvalidLines() { Schema schema = Schema.builder() @@ -560,6 +579,126 @@ void testReadCSVBuffer() { } } + byte[][] sliceBytes(byte[] data, int slices) { + slices = Math.min(data.length, slices); + // We are not going to worry about making it super even here. + // The last one gets the extras. + int bytesPerSlice = data.length / slices; + byte[][] ret = new byte[slices][]; + int startingAt = 0; + for (int i = 0; i < (slices - 1); i++) { + ret[i] = new byte[bytesPerSlice]; + System.arraycopy(data, startingAt, ret[i], 0, bytesPerSlice); + startingAt += bytesPerSlice; + } + // Now for the last one + ret[slices - 1] = new byte[data.length - startingAt]; + System.arraycopy(data, startingAt, ret[slices - 1], 0, data.length - startingAt); + return ret; + } + + @Test + void testReadCSVBufferMultiBuffer() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + byte[][] data = sliceBytes(CSV_DATA_BUFFER, 10); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + + public static byte[] arrayFrom(File f) throws IOException { + long len = f.length(); + if (len > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry cannot read " + f + + " into an array it does not fit"); + } + int remaining = (int)len; + byte[] ret = new byte[remaining]; + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + int at = 0; + while (remaining > 0) { + int amount = fin.read(ret, at, remaining); + at += amount; + remaining -= amount; + } + } + return ret; + } + + public static MultiBufferDataSource sourceFrom(File f) throws IOException { + long len = f.length(); + byte[] tmp = new byte[(int)Math.min(32 * 1024, len)]; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + long at = 0; + while (at < len) { + int amount = fin.read(tmp); + buffer.setBytes(at, tmp, 0, amount); + at += amount; + } + } + return new MultiBufferDataSource(buffer); + } + } + + public 
static MultiBufferDataSource sourceFrom(byte[] data) { + long len = data.length; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + buffer.setBytes(0, data, 0, len); + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[][] data) { + HostMemoryBuffer[] buffers = new HostMemoryBuffer[data.length]; + try { + for (int i = 0; i < data.length; i++) { + byte[] subData = data[i]; + buffers[i] = HostMemoryBuffer.allocate(subData.length); + buffers[i].setBytes(0, subData, 0, subData.length); + } + return new MultiBufferDataSource(buffers); + } finally { + for (HostMemoryBuffer buffer: buffers) { + if (buffer != null) { + buffer.close(); + } + } + } + } + + @Test + void testReadCSVDataSource() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(TableTest.CSV_DATA_BUFFER); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVWithOffset() { CSVOptions opts = CSVOptions.builder() @@ -864,6 +1003,37 @@ void testReadParquet() { } } + @Test + void testReadParquetFromDataSource() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + + @Test + void testReadParquetMultiBuffer() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + byte [][] data = sliceBytes(arrayFrom(TEST_PARQUET_FILE), 10); + try (MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() @@ -1018,6 +1188,23 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetFromDataSource() throws IOException { + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); + ParquetChunkedReader reader = new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + @Test void testReadAvro() { AvroOptions opts = AvroOptions.builder() @@ -1037,6 +1224,26 @@ void testReadAvro() { } } + @Test + void testReadAvroFromDataSource() throws IOException { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + 
.column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ALL_TYPES_PLAIN_AVRO_FILE); + Table table = Table.readAvro(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadAvroBuffer() throws IOException{ AvroOptions opts = AvroOptions.builder() @@ -1094,6 +1301,24 @@ void testReadORC() { } } + @Test + void testReadORCFromDataSource() throws IOException { + ORCOptions opts = ORCOptions.builder() + .includeColumn("string1") + .includeColumn("float1") + .includeColumn("int1") + .build(); + try (Table expected = new Table.TestBuilder() + .column("hi","bye") + .column(1.0f,2.0f) + .column(65536,65536) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ORC_FILE); + Table table = Table.readORC(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORCBuffer() throws IOException { ORCOptions opts = ORCOptions.builder() diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 6f3e428d291..a8b91c27095 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_version 23.10.00) +set(cudf_version 23.12.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/cudf/cudf/VERSION b/python/cudf/cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf/cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e5c78fca893..02274a5fdd1 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -17,6 +17,7 @@ from rmm.allocators.numba import RMMNumbaManager from cudf import api, core, datasets, testing +from cudf._version import __git_commit__, __version__ from cudf.api.extensions import ( register_dataframe_accessor, register_index_accessor, @@ -99,8 +100,6 @@ rmm.register_reinitialize_hook(clear_cache) -__version__ = "23.10.00" - __all__ = [ "BaseIndex", "CategoricalDtype", diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 65d2e09988f..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
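The orc.py fuzz-harness hunks that follow replace the pyorc dependency with pyarrow's ORC module: test files are now written with pa.orc.write_table and individual stripes are read back through pa.orc.ORCFile. As a minimal, standalone sketch of those pyarrow.orc calls (plain pyarrow, nothing cudf-specific; the harness additionally passes a stripe_size= keyword when writing):

import io

import pyarrow as pa
import pyarrow.orc as orc

# Write a small table to an in-memory buffer, then read it back one stripe
# at a time and reassemble it, mirroring the updated fuzz harness.
buf = io.BytesIO()
table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
orc.write_table(table, buf)

buf.seek(0)
orc_file = orc.ORCFile(buf)
batches = [orc_file.read_stripe(i) for i in range(orc_file.nstripes)]
round_tripped = pa.Table.from_batches(batches).to_pandas()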
import copy import io @@ -6,14 +6,13 @@ import random import numpy as np -import pyorc +import pyarrow as pa import cudf from cudf._fuzz_testing.io import IOFuzz from cudf._fuzz_testing.utils import ( ALL_POSSIBLE_VALUES, _generate_rand_meta, - pandas_to_orc, pyarrow_to_pandas, ) from cudf.testing import dataset_generator as dg @@ -82,12 +81,7 @@ def generate_input(self): logging.info(f"Shape of DataFrame generated: {table.shape}") self._df = df file_obj = io.BytesIO() - pandas_to_orc( - df, - file_io_obj=file_obj, - stripe_size=self._rand(len(df)), - arrow_table_schema=table.schema, - ) + pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) file_obj.seek(0) buf = file_obj.read() self._current_buffer = copy.copy(buf) @@ -109,8 +103,8 @@ def set_rand_params(self, params): ) elif param == "stripes": f = io.BytesIO(self._current_buffer) - reader = pyorc.Reader(f) - stripes = [i for i in range(reader.num_of_stripes)] + orcFile = pa.orc.ORCFile(f) + stripes = list(range(orcFile.nstripes)) params_dict[param] = np.random.choice( [ None, @@ -119,7 +113,7 @@ def set_rand_params(self, params): int, np.unique( np.random.choice( - stripes, reader.num_of_stripes + stripes, orcFile.nstripes ) ), ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 03418e00cde..0c88c1aeacd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,13 +1,11 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. import random -from collections import OrderedDict import fastavro import numpy as np import pandas as pd import pyarrow as pa -import pyorc import cudf from cudf.testing._utils import assert_eq @@ -41,40 +39,6 @@ cudf.dtype(" can result in incorrect dtype by pandas. - df = df.astype(dtypes) + orc_file = pa.orc.ORCFile(f) + records = [orc_file.read_stripe(i) for i in stripes] + pa_table = pa.Table.from_batches(records) + df = pa_table.to_pandas() return df diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 947659c290a..c041c7f4842 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf) # necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C # /opt/_internal" find_package(NumPy REQUIRED) -set(targets_using_numpy interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() set(targets_using_dlpack interop) foreach(target IN LISTS targets_using_dlpack) @@ -107,8 +101,12 @@ if(${PYARROW_RESULT}) message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") endif() -set(targets_using_arrow_headers interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_arrow_headers) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f751d73b142..0edf9f8aa95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index feaf75ef237..1ec4719631e 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index f57bc15ed57..ea6ee76c14a 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable +from libcpp.functional cimport reference_wrapper +from libcpp.memory cimport make_unique + cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 20725c252fc..5637b55ac1c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from 
cudf._lib.cpp.table.table_view cimport table_view diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2ecdf76842f..0266404fc50 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector @@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index dd6f919a74d..d5ac8574fe4 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 2b92b9b58d3..cdd1bde0274 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -2,16 +2,16 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.expressions cimport expression -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type @@ -90,10 +90,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod parquet_writer_options_builder builder( @@ -129,6 +137,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + @@ -141,6 +152,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" 
nogil: parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + parquet_writer_options build() except + @@ -172,10 +189,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod chunked_parquet_writer_options_builder builder( @@ -199,6 +224,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + + chunked_parquet_writer_options_builder& int96_timestamps( + bool enabled + ) except + + chunked_parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + chunked_parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + @@ -211,6 +242,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + chunked_parquet_writer_options build() except + diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index ba481d9a1d3..927c2118473 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -2,9 +2,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 01eaca82692..d8cc329b0a0 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -52,6 +52,11 @@ cdef extern from "cudf/io/types.hpp" \ STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, + ctypedef enum dictionary_policy: + NEVER = 0, + ADAPTIVE = 1, + ALWAYS = 2, + cdef cppclass column_name_info: string name vector[column_name_info] children diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd deleted file mode 100644 index f3e2d6d0878..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
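The parquet writer options plumbed through above (utc_timestamps, write_v2_headers, dictionary_policy) surface later in this diff as two new keywords on write_parquet and DataFrame.to_parquet: header_version ("1.0" or "2.0") and use_dictionary, which maps to the ALWAYS/NEVER dictionary_policy values. A hedged user-level sketch, assuming the keywords are forwarded as the later hunks indicate:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# header_version selects V1 vs V2 data page headers; use_dictionary=False
# disables dictionary encoding via the NEVER dictionary policy.
df.to_parquet("out.parquet", header_version="2.0", use_dictionary=False)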
- - -cdef extern from "" namespace "std" nogil: - cdef cppclass reference_wrapper[T]: - reference_wrapper() - reference_wrapper(T) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd deleted file mode 100644 index 2178f1a940c..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - - -cdef extern from "" namespace "std" nogil: - # The Cython standard header does not have except +, so C++ - # exceptions from make_unique are not caught and translated to - # Python ones. This is not perfectly ergonomic, we always have to - # wrap make_unique in move, but at least we can catch exceptions. - # See https://github.com/cython/cython/issues/5560 - unique_ptr[T] make_unique[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd deleted file mode 100644 index a78c18f3f7a..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: -# Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libcpp cimport bool - - -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
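The new nvtext byte_pair_encode.pxd, whose declarations continue below, binds load_merge_pairs and byte_pair_encoding; the Python-facing wrapper built on them, cudf.core.byte_pair_encoding.BytePairEncoder, is added further down in this diff. A usage sketch mirroring that class's docstring (assumes a cudf build that includes these changes):

import cudf
from cudf.core.byte_pair_encoding import BytePairEncoder

# Merge pairs are supplied as a strings column, one "left right" pair per row.
merge_pairs = cudf.Series(
    ["e n", "i t", "i s", "e s", "en t", "c e", "es t",
     "en ce", "T h", "Th is", "t est", "s ent", "t h", "th is"]
)
bpe = BytePairEncoder(merge_pairs)

strings = cudf.Series(["This is the sentence", "thisisit"])
encoded = bpe(strings)                     # pieces joined with the default " "
encoded_alt = bpe(strings, separator="_")  # or a caller-supplied separator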
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/cpp/strings/json.pxd index a017e1c5382..eed627c96b5 100644 --- a/python/cudf/cudf/_lib/cpp/strings/json.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar -cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil: +cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: cdef cppclass get_json_object_options: get_json_object_options() except + # getters diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 8d7545ffe15..01a080f635f 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -4,12 +4,11 @@ from enum import Enum from cython.operator cimport dereference from libc.stdint cimport int64_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp cimport expressions as libcudf_exp -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.types cimport size_type # Necessary for proper casting, see below. 
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a26d820de6f..b3778e45cde 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.scalar import as_device_scalar +from libcpp.functional cimport reference_wrapper + cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.aggregation cimport ( @@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport ( make_groupby_scan_aggregation, ) from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 639754fc54f..8fd2a409d90 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,14 +4,7 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar, - CTable, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport type_id -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -199,79 +182,3 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_arrow_scalar(DeviceScalar source_scalar): - """Convert a scalar to a PyArrow scalar. - - Parameters - ---------- - source_scalar : the scalar to convert - - Returns - ------- - pyarrow.lib.Scalar - """ - cdef vector[column_metadata] cpp_metadata = gather_metadata( - [("", source_scalar.dtype)] - ) - cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() - - cdef shared_ptr[CScalar] cpp_arrow_scalar - with nogil: - cpp_arrow_scalar = cpp_to_arrow( - source_scalar_ptr[0], cpp_metadata[0] - ) - - return pyarrow_wrap_scalar(cpp_arrow_scalar) - - -@acquire_spill_lock() -def from_arrow_scalar(object input_scalar, output_dtype=None): - """Convert from PyArrow scalar to a cudf scalar. 
- - Parameters - ---------- - input_scalar : PyArrow scalar - output_dtype : output type to cast to, ignored except for decimals - - Returns - ------- - cudf._lib.DeviceScalar - """ - cdef shared_ptr[CScalar] cpp_arrow_scalar = ( - pyarrow_unwrap_scalar(input_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) - - cdef type_id ctype = c_result.get().type().id() - if ctype == type_id.DECIMAL128: - if output_dtype is None: - # Decimals must be cast to the cudf dtype of the right width - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - if isinstance(output_dtype, Decimal32Dtype): - c_result.reset( - new fixed_point_scalar[decimal32]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - elif isinstance(output_dtype, Decimal64Dtype): - c_result.reset( - new fixed_point_scalar[decimal64]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - # Decimal128Dtype is a no-op, no conversion needed. - - return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 416680aae24..378be978cc0 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 5b4538629f6..1f98140d9e4 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( 
@@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar + + +cdef class BPEMergePairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPEMergePairs merge_pairs, + object separator +): + cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 85fd25cf1a9..4acb1ce10b1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource @@ -321,6 +320,8 @@ def write_parquet( object max_page_size_rows=None, object partitions_info=None, object force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): """ Cython 
function to call into libcudf API, see `write_parquet`. @@ -383,6 +384,18 @@ def write_parquet( tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) user_data.push_back(tmp_user_data) + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + cudf_io_types.dictionary_policy.ALWAYS + if use_dictionary + else cudf_io_types.dictionary_policy.NEVER + ) + cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) @@ -399,6 +412,9 @@ def write_parquet( .compression(comp_type) .stats_level(stat_freq) .int96_timestamps(_int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) .build() ) if partitions_info is not None: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..5185b2d4bb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,10 +12,33 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. + # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..7a35854392c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +from . 
cimport copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,7 +13,9 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,16 +1,19 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..3a79e5425d4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + cdef public object name + cdef public object children_meta + cdef column_metadata to_libcudf(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx new file mode 100644 index 00000000000..0cdca275027 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + def __init__(self, name): + self.name = name + self.children_meta = [] + + cdef column_metadata to_libcudf(self): + """Convert to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. + """ + cdef column_metadata c_metadata + cdef ColumnMetadata child_meta + c_metadata.name = self.name.encode() + for child_meta in self.children_meta: + c_metadata.children_meta.push_back(child_meta.to_libcudf()) + return c_metadata diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd new file mode 100644 index 00000000000..09d853d832f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport DeviceMemoryResource + +from cudf._lib.cpp.scalar.scalar cimport scalar + +from .interop cimport ColumnMetadata +from .types cimport DataType + + +cdef class Scalar: + cdef unique_ptr[scalar] c_obj + cdef DataType _data_type + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. 
`mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + cdef const scalar* get(self) except * + + cpdef DataType type(self) + cpdef bool is_valid(self) + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx new file mode 100644 index 00000000000..04f588bd3e6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -0,0 +1,133 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cython cimport no_gc_clear +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport get_current_device_resource + +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) + +from .interop cimport ColumnMetadata +from .types cimport DataType, type_id + + +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the Scalar is in a reference cycle. Removing the tp_clear +# function with the no_gc_clear decoration prevents that. See +# https://github.com/rapidsai/rmm/pull/931 for details. +@no_gc_clear +cdef class Scalar: + """A scalar value in device memory.""" + # Unlike for columns, libcudf does not support scalar views. All APIs that + # accept scalar values accept references to the owning object rather than a + # special view type. As a result, pylibcudf.Scalar has a simpler structure + # than pylibcudf.Column because it can be a true wrapper around a libcudf + # column + + def __cinit__(self, *args, **kwargs): + self.mr = get_current_device_resource() + + def __init__(self, pa.Scalar value=None): + # TODO: This case is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. 
+ if value is not None: + raise ValueError("Scalar should be constructed with a factory") + + @staticmethod + def from_arrow(pa.Scalar value, DataType data_type=None): + # Allow passing a dtype, but only for the purpose of decimals for now + + cdef shared_ptr[pa.CScalar] cscalar = ( + pa.pyarrow_unwrap_scalar(value) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + cdef Scalar s = Scalar.from_libcudf(move(c_result)) + + if s.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return s + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + + if tid == type_id.DECIMAL32: + s.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + s.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid != type_id.DECIMAL128: + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + return s + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): + cdef shared_ptr[pa.CScalar] c_result + cdef column_metadata c_metadata = metadata.to_libcudf() + + with nogil: + c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata)) + + return pa.pyarrow_wrap_scalar(c_result) + + cdef const scalar* get(self) except *: + return self.c_obj.get() + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef bool is_valid(self): + """True if the scalar is valid, false if not""" + return self.get().is_valid() + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): + """Construct a Scalar object from a libcudf scalar. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef Scalar s = Scalar.__new__(Scalar) + s.c_obj.swap(libcudf_scalar) + s._data_type = DataType.from_libcudf(s.get().type()) + return s diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 95f197b13eb..a9e2874232a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -16,3 +17,5 @@ cdef class Table: cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) cpdef list columns(self) + + cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 720f9815bd6..c41eb82e4a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,15 +1,22 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
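The pylibcudf Scalar defined above takes over the arrow interop that the earlier hunks removed from cudf._lib.interop, including the decimal32/decimal64 narrowing. A small sketch of how it is driven, using only names introduced in this diff (assumes a built cudf so cudf._lib.pylibcudf is importable):

import decimal

import pyarrow as pa

from cudf._lib import pylibcudf as plc

# Plain scalars round-trip with no extra type information.
s = plc.Scalar.from_arrow(pa.scalar(42))
assert s.is_valid()
meta = plc.interop.ColumnMetadata("")      # unnamed, no children
assert s.to_arrow(meta).as_py() == 42

# Decimal scalars arrive from arrow as 128-bit values and must be narrowed
# explicitly; the scale is passed negated, following libcudf convention.
d = plc.Scalar.from_arrow(
    pa.scalar(decimal.Decimal("1.23")),
    plc.DataType(plc.TypeId.DECIMAL64, -2),
)
assert d.is_valid()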
from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .interop cimport ColumnMetadata cdef class Table: @@ -60,3 +67,27 @@ cdef class Table: cpdef list columns(self): return self._columns + + @staticmethod + def from_arrow(pa.Table pyarrow_table): + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + cpdef pa.Table to_arrow(self, list metadata): + cdef shared_ptr[pa.CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_libcudf()) + + with nogil: + c_result = move(cpp_to_arrow(self.view(), c_metadata)) + + return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..77733f59c3d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,20 +1,19 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +# TODO: Would like to remove this cimport, but it will require some more work +# to excise all C code in scalar.pyx that relies on using the C API of the +# pylibcudf Scalar underlying the DeviceScalar. +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr + cdef pylibcudf.Scalar c_value cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5ab286c5701..0b64c75f7b6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. 
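Table gains matching from_arrow/to_arrow entry points in the hunk above; to_arrow expects one ColumnMetadata per top-level column (with child metadata for nested types, as the gather_metadata helper added below constructs it). A round-trip sketch under the same assumptions as the Scalar example:

import pyarrow as pa

from cudf._lib import pylibcudf as plc

pa_table = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
tbl = plc.Table.from_arrow(pa_table)

# Flat columns need no child metadata, only their names.
metadata = [plc.interop.ColumnMetadata(name) for name in pa_table.column_names]
assert tbl.to_arrow(metadata).to_pydict() == pa_table.to_pydict()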
-cimport cython - import copy import numpy as np @@ -13,17 +11,17 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf +from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + is_list_dtype, + is_struct_dtype, +) from cudf.core.missing import NA, NaT -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - -from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar - cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, @@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id def _replace_nested(obj, check, replacement): @@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement): _replace_nested(v, check, replacement) -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +def gather_metadata(dtypes): + """Convert a dict of dtypes to a list of ColumnMetadata objects. + + The metadata is constructed recursively so that nested types are + represented as nested ColumnMetadata objects. + + Parameters + ---------- + dtypes : dict + A dict mapping column names to dtypes. + + Returns + ------- + List[ColumnMetadata] + A list of ColumnMetadata objects. + """ + out = [] + for name, dtype in dtypes.items(): + v = pylibcudf.interop.ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: + # TODO: I think this should be removable, except that currently the way + # that from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -85,7 +113,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') if cudf.utils.utils.is_na_like(value): value = None @@ -108,10 +136,17 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa_type) - # Note: This factory-like behavior in __init__ will be removed when - # migrating to pylibcudf. 
- cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) - self.c_value.swap(obj.c_value) + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + tid = pylibcudf.TypeId.DECIMAL128 + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + data_type = pylibcudf.DataType(tid, -dtype.scale) + + self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): is_datetime = self.dtype.kind == "M" @@ -119,7 +154,8 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA - ps = to_arrow_scalar(self) + metadata = gather_metadata({"": self.dtype})[0] + ps = self.c_value.to_arrow(metadata) if not ps.is_valid: return null_type @@ -158,13 +194,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). """ - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -183,7 +219,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index a5e87a456cb..fc11f047ab4 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -40,6 +40,14 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 434f79d3b5f..f55bb1fb780 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 59a22c06e85..2f2063482af 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,3 +20,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 4d76cbfcdb5..808d1321b0b 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,13 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( make_timezone_transition_table as cpp_make_timezone_transition_table, ) -from cudf._lib.cpp.libcpp.optional cimport make_optional from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py new file mode 100644 index 00000000000..ecf6ddd8e3b --- /dev/null +++ b/python/cudf/cudf/_version.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index f056a0fd592..91f3b2cd544 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -11,14 +11,20 @@ import weakref from collections import defaultdict from dataclasses import dataclass +from functools import partial from typing import Dict, List, Optional, Tuple import rmm.mr from cudf.core.buffer.spillable_buffer import SpillableBuffer from cudf.options import get_option +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +_spill_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="cudf_python-spill" +) + def get_traceback() -> str: """Pretty print current traceback to a string""" @@ -329,6 +335,7 @@ def buffers( ret = tuple(sorted(ret, key=lambda b: b.last_accessed)) return ret + @_spill_cudf_nvtx_annotate def spill_device_memory(self, nbytes: int) -> int: """Try to spill device memory diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 84fb2044c62..1856bec1876 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -20,6 +20,7 @@ get_ptr_and_size, host_memory_allocation, ) +from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate from cudf.utils.string import format_bytes if TYPE_CHECKING: @@ -291,8 +292,15 @@ def spill(self, target: str = "cpu") -> None: ) if (ptr_type, target) == ("gpu", "cpu"): - host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem) + with annotate( + message="SpillDtoH", + color=_get_color_for_nvtx("SpillDtoH"), + domain="cudf_python-spill", + ): + host_mem = host_memory_allocation(self.size) + rmm._lib.device_buffer.copy_ptr_to_host( + self._ptr, host_mem + ) self._ptr_desc["memoryview"] = host_mem self._ptr = 0 self._owner = None @@ -302,9 +310,15 @@ def spill(self, target: str = "cpu") -> None: # trigger a new call to this buffer's `spill()`. 
# Therefore, it is important that spilling-on-demand doesn't # try to unspill an already locked buffer! - dev_mem = rmm.DeviceBuffer.to_device( - self._ptr_desc.pop("memoryview") - ) + with annotate( + message="SpillHtoD", + color=_get_color_for_nvtx("SpillHtoD"), + domain="cudf_python-spill", + ): + + dev_mem = rmm.DeviceBuffer.to_device( + self._ptr_desc.pop("memoryview") + ) self._ptr = dev_mem.ptr self._owner = dev_mem assert self._size == dev_mem.size diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..4c881022ecf --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.byte_pair_encode import ( + BPEMergePairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. + + Parameters + ---------- + merges_pairs : str + Strings column of merge pairs + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) + + def __call__(self, text, separator: str = " "): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... "t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..b4f65693d85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2102,7 +2102,10 @@ def as_column( elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN - data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) + length = length or 1 + data = as_column( + pa.array(pd.Series([arbitrary] * length), from_pandas=True) + ) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e3d4b20f141..16eead6ea81 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -97,11 +97,8 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import ( - GetAttrGetItemMixin, - _cudf_nvtx_annotate, - _external_only_api, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api _cupy_nan_methods_map = { "min": "nanmin", @@ -6495,6 +6492,8 @@ def to_parquet( max_page_size_rows=None, storage_options=None, return_metadata=False, + use_dictionary=True, + header_version="1.0", *args, **kwargs, ): @@ -6519,6 +6518,8 @@ def to_parquet( max_page_size_rows=max_page_size_rows, storage_options=storage_options, 
return_metadata=return_metadata, + use_dictionary=use_dictionary, + header_version=header_version, *args, **kwargs, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f7329d459e9..b2f0651d576 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,11 +46,8 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column, find_common_type -from cudf.utils.utils import ( - _array_ufunc, - _cudf_nvtx_annotate, - _warn_no_dask_cudf, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf # TODO: It looks like Frame is missing a declaration of `copy`, need to add diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4b715e962e7..c48e5109ff2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -29,7 +29,8 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 51a7e9dfe8e..9f0c66a5c74 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -66,11 +66,8 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import ( - _cudf_nvtx_annotate, - _warn_no_dask_cudf, - search_range, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf, search_range def _lexsorted_equal_range( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fef62594fb8..4211a8c24bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -69,7 +69,8 @@ ) from cudf.utils import docutils from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate, _warn_no_dask_cudf +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
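The import rewrites in the files above and in the files that follow all track one refactor: `_cudf_nvtx_annotate` (and its dask variant) moved out of `cudf.utils.utils` into the new `cudf/utils/nvtx_annotation.py` module added later in this diff. A minimal usage sketch under that assumption; the decorated function below is a hypothetical example, not cudf code:

    # Hypothetical usage sketch, not part of the diff: the decorator is the one
    # defined in the new cudf/utils/nvtx_annotation.py added further down.
    from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate

    @_cudf_nvtx_annotate
    def example_read(path):
        # Emits an NVTX range named after the function's qualified name in the
        # default "cudf_python" domain, so the call is visible when profiling
        # with Nsight Systems.
        ...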
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6a6e37180ca..b94f8f583f4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -203,6 +203,7 @@ def perform_merge(self) -> cudf.DataFrame: if left_rows is not None else cudf.DataFrame._from_data({}) ) + del left_rows right_result = ( self.rhs._gather( GatherMap.from_column_unchecked( @@ -213,7 +214,7 @@ def perform_merge(self) -> cudf.DataFrame: if right_rows is not None else cudf.DataFrame._from_data({}) ) - + del right_rows result = cudf.DataFrame._from_data( *self._merge_results(left_result, right_result) ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b4bbd0a8c3c..d0c8a513686 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -26,12 +26,8 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import ( - NotIterable, - _cudf_nvtx_annotate, - _external_only_api, - _is_same_name, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6fa5a8fd44b..04a7ed3abf7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -92,7 +92,7 @@ is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _format_percentile_names(percentiles): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d35762c8481..73464238dd4 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -20,7 +20,8 @@ ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable class SingleColumnFrame(Frame, NotIterable): diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index b18720f5db5..5dbcf455e33 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -28,7 +28,7 @@ _supported_dtypes_from_frame, ) from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _get_frame_groupby_type(dtype, index_dtype): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 35a3f6c1ffd..7b7ac2b3070 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -39,7 +39,8 @@ STRING_TYPES, TIMEDELTA_TYPES, ) -from cudf.utils.utils import _cudf_nvtx_annotate, initfunc +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index bacc0641639..764885dd7b6 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -11,7 +11,7 @@ from cudf.api.types import 
is_scalar from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f51952d23bf..d135a31438e 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -5,7 +5,6 @@ import pyarrow as pa from fsspec.utils import stringify_path -from pyarrow import orc as orc import cudf from cudf._lib import orc as liborc @@ -17,6 +16,8 @@ def _make_empty_df(filepath_or_buffer, columns): + from pyarrow import orc + orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns @@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob): @ioutils.doc_read_orc_metadata() def read_orc_metadata(path): """{docstring}""" + from pyarrow import orc orc_file = orc.ORCFile(path) @@ -380,6 +382,7 @@ def read_orc( ) ) else: + from pyarrow import orc def read_orc_stripe(orc_file, stripe, columns): pa_table = orc_file.read_stripe(stripe, columns) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 81021a5d578..bcc24a85cf9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,14 +15,14 @@ import numpy as np import pandas as pd -from pyarrow import dataset as ds, parquet as pq +from pyarrow import dataset as ds import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like from cudf.core.column import build_categorical_column, column_empty, full from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate BYTE_SIZES = { "kb": 1000, @@ -66,6 +66,8 @@ def _write_parquet( partitions_info=None, storage_options=None, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -96,6 +98,8 @@ def _write_parquet( "max_page_size_rows": max_page_size_rows, "partitions_info": partitions_info, "force_nullable_schema": force_nullable_schema, + "header_version": header_version, + "use_dictionary": use_dictionary, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -204,7 +208,6 @@ def write_to_dataset( fs.mkdirs(root_path, exist_ok=True) if partition_cols is not None and len(partition_cols) > 0: - ( full_paths, metadata_file_paths, @@ -266,6 +269,7 @@ def write_to_dataset( @_cudf_nvtx_annotate def read_parquet_metadata(path): """{docstring}""" + import pyarrow.parquet as pq pq_file = pq.ParquetFile(path) @@ -303,7 +307,9 @@ def _process_dataset( # Convert filters to ds.Expression if filters is not None: - filters = pq.filters_to_expression(filters) + from pyarrow.parquet import filters_to_expression + + filters = filters_to_expression(filters) # Initialize ds.FilesystemDataset # TODO: Remove the if len(paths) workaround after following bug is fixed: @@ -709,7 +715,6 @@ def _parquet_to_frame( dataset_kwargs=None, **kwargs, ): - # If this is not a partitioned read, only need # one call to `_read_parquet` if not partition_keys: @@ -753,7 +758,7 @@ def _parquet_to_frame( ) ) # Add partition columns to the last DataFrame - for (name, value) in part_key: + for name, value in part_key: _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from 
`codes` @@ -866,6 +871,8 @@ def to_parquet( storage_options=None, return_metadata=False, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, *args, **kwargs, ): @@ -940,9 +947,13 @@ def to_parquet( partitions_info=partition_info, storage_options=storage_options, force_nullable_schema=force_nullable_schema, + header_version=header_version, + use_dictionary=use_dictionary, ) else: + import pyarrow.parquet as pq + if partition_offsets is not None: warnings.warn( "partition_offsets will be ignored when engine is not cudf" @@ -1040,7 +1051,6 @@ def _get_groups_and_offsets( preserve_index=False, **kwargs, ): - if not (set(df._data) - set(partition_cols)): warnings.warn("No data left to save outside partition columns") diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index eb2c7fa7ef6..0e19972f6e0 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from io import BytesIO, StringIO import cudf from cudf._lib import text as libtext from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 02e8e960678..fb8569fa1d0 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -33,7 +33,7 @@ def profile(function_profile, line_profile, fn): elif function_profile: with Profiler() as profiler: yield fn - profiler.print_per_func_stats() + profiler.print_per_function_stats() else: yield fn diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. 
assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet new file mode 100644 index 00000000000..5008ac0b22b Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet differ diff --git a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet new file mode 100644 index 00000000000..7f6006a75bf Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet differ diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..0546638f388 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error): @pytest.mark.parametrize("nan_as_null", [True, False]) -def test_as_column_scalar_with_nan(nan_as_null): - size = 10 - scalar = np.nan - +@pytest.mark.parametrize( + "scalar", + [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], + ids=repr, +) +@pytest.mark.parametrize("size", [1, 10]) +def test_as_column_scalar_with_nan(nan_as_null, scalar, size): expected = ( - cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() .to_numpy() ) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index e4b2af90448..0745e5aba48 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -6,6 +6,7 @@ import numpy as np import pyarrow as pa import pytest +from packaging import version import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn @@ -91,7 +92,15 @@ def test_from_arrow_max_precision_decimal32(): "to_dtype", [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], ) -def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): +def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): + request.applymarker( + pytest.mark.xfail( + condition=version.parse(pa.__version__) >= version.parse("13.0.0") + and from_dtype == np.dtype("float32") + and to_dtype.precision > 7, + reason="https://github.com/rapidsai/cudf/issues/14169", + ) + ) got = data.astype(from_dtype) pa_arr = got.to_arrow().cast( diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py new file mode 100644 index 00000000000..7dd25ebc500 --- /dev/null +++ b/python/cudf/cudf/tests/test_mvc.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+import subprocess +import sys + +import pytest + +IS_CUDA_11 = False +IS_CUDA_12 = False +try: + from ptxcompiler.patch import safe_get_versions +except ModuleNotFoundError: + from cudf.utils._ptxcompiler import safe_get_versions + +# do not test cuda 12 if pynvjitlink isn't present +HAVE_PYNVJITLINK = False +try: + import pynvjitlink # noqa: F401 + + HAVE_PYNVJITLINK = True +except ModuleNotFoundError: + pass + + +versions = safe_get_versions() +driver_version, runtime_version = versions + +if (11, 0) <= driver_version < (12, 0): + IS_CUDA_11 = True +if (12, 0) <= driver_version < (13, 0): + IS_CUDA_12 = True + + +TEST_BODY = """ +@numba.cuda.jit +def test_kernel(x): + id = numba.cuda.grid(1) + if id < len(x): + x[id] += 1 + +s = cudf.Series([1, 2, 3]) +with _CUDFNumbaConfig(): + test_kernel.forall(len(s))(s) +""" + +CUDA_11_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig, patch_numba_linker_cuda_11 + + +patch_numba_linker_cuda_11() +""" + + TEST_BODY +) + + +CUDA_12_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig +from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, +) + +patch_numba_linker_pynvjitlink() +""" + + TEST_BODY +) + + +@pytest.mark.parametrize( + "test", + [ + pytest.param( + CUDA_11_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_11, + reason="Minor Version Compatibility test for CUDA 11", + ), + ), + pytest.param( + CUDA_12_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_12 or not HAVE_PYNVJITLINK, + reason="Minor Version Compatibility test for CUDA 12", + ), + ), + ], +) +def test_numba_mvc(test): + cp = subprocess.run( + [sys.executable, "-c", test], + capture_output=True, + cwd="/", + ) + + assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_no_cuinit.py b/python/cudf/cudf/tests/test_no_cuinit.py index b142b0dab33..45d812fe9a2 100644 --- a/python/cudf/cudf/tests/test_no_cuinit.py +++ b/python/cudf/cudf/tests/test_no_cuinit.py @@ -66,6 +66,7 @@ def test_cudf_import_no_cuinit(cuda_gdb): env=env, capture_output=True, text=True, + cwd="/", ) cuInit_called = output.stdout.find("in cuInit ()") diff --git a/python/cudf/cudf/tests/test_numba_import.py b/python/cudf/cudf/tests/test_numba_import.py deleted file mode 100644 index 238a32a94fa..00000000000 --- a/python/cudf/cudf/tests/test_numba_import.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
-import subprocess -import sys - -import pytest - -IS_CUDA_11 = False -try: - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - if driver_version < (12, 0): - IS_CUDA_11 = True -except ModuleNotFoundError: - pass - -TEST_NUMBA_MVC_ENABLED = """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig, _patch_numba_mvc - - -_patch_numba_mvc() - -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - - -@pytest.mark.skipif( - not IS_CUDA_11, reason="Minor Version Compatibility test for CUDA 11" -) -def test_numba_mvc_enabled_cuda_11(): - cp = subprocess.run( - [sys.executable, "-c", TEST_NUMBA_MVC_ENABLED], - capture_output=True, - cwd="/", - ) - assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 07aa5430f4f..7407da9c4ac 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,8 +10,6 @@ import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.orc -import pyorc import pytest import cudf @@ -150,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir): ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], ) def test_orc_reader_datetimestamp(datadir, inputfile, use_index): + from pyarrow import orc + path = datadir / inputfile try: - orcfile = pa.orc.ORCFile(path) + orcfile = orc.ORCFile(path) except pa.ArrowIOError as e: pytest.skip(".orc file is not found: %s" % e) @@ -295,28 +295,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows): def test_orc_read_skiprows(): buff = BytesIO() - data = [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - writer.writerows([(d,) for d in data]) - writer.close() - + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) # testing 10 skiprows due to a boolean specific bug fix that didn't # repro for other sizes of data skiprows = 10 @@ -605,6 +606,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -623,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -677,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [2, 100, 6000000]) def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed @@ -729,7 +734,7 @@ def 
test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(gdf_fname) + orc_file = orc.ORCFile(gdf_fname) ( file_stats, stripes_stats, @@ -782,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): + from pyarrow import orc + # Make a dataframe gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) fname = tmpdir.join("gdf.orc") @@ -790,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): gdf.to_orc(fname.strpath) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -978,44 +985,12 @@ def test_orc_string_stream_offset_issue(): assert_eq(df, cudf.read_orc(buffer)) -# Data is generated using pyorc module def generate_list_struct_buff(size=100_000): rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), - "lvl1_list": pyorc.Array(pyorc.BigInt()), - "lvl1_struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "lvl2_struct": pyorc.Struct( - **{ - "a": pyorc.BigInt(), - "lvl1_struct": pyorc.Struct( - **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} - ), - } - ), - "list_nests_struct": pyorc.Array( - pyorc.Array( - pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) - ) - ), - "struct_nests_list": pyorc.Struct( - **{ - "struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "list": pyorc.Array(pyorc.BigInt()), - } - ), - } - - schema = pyorc.Struct(**schema) - lvl3_list = [ rd.choice( [ @@ -1024,50 +999,57 @@ def generate_list_struct_buff(size=100_000): [ [ rd.choice([None, np.random.randint(1, 3)]) - for z in range(np.random.randint(1, 3)) + for _ in range(np.random.randint(1, 3)) ] - for z in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ] - for y in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ], ] ) - for x in range(size) + for _ in range(size) ] lvl1_list = [ [ rd.choice([None, np.random.randint(0, 3)]) - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) ] lvl1_struct = [ - rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))]) - for x in range(size) + rd.choice( + [ + None, + {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + ] + ) + for _ in range(size) ] lvl2_struct = [ rd.choice( [ None, - ( - rd.choice([None, np.random.randint(0, 3)]), - ( - rd.choice([None, np.random.randint(0, 3)]), - np.random.randint(0, 3), - ), - ), + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), + }, + }, ] ) - for x in range(size) + for _ in range(size) ] list_nests_struct = [ [ - [rd.choice(lvl1_struct), rd.choice(lvl1_struct)] - for y in range(np.random.randint(1, 4)) + {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) ] - struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)] df = 
pd.DataFrame( { @@ -1080,15 +1062,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = pyorc.Writer(buff, schema, stripe_size=1024) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) - writer.close() + df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024}) return buff @@ -1109,6 +1083,8 @@ def list_struct_buff(): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + from pyarrow import orc + gdf = cudf.read_orc( list_struct_buff, columns=columns, @@ -1116,7 +1092,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1155,111 +1131,96 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): from string import ascii_letters as al + from pyarrow import orc + rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), - "lvl2_map": pyorc.Map( - key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) - ), - "lvl2_struct_map": pyorc.Map( - key=pyorc.String(), - value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), - ), - } - - schema = pyorc.Struct(**schema) - - lvl1_map = [ - rd.choice( - [ - None, - [ - ( - rd.choice(al), - rd.choice([None, np.random.randint(1, 1500)]), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_map = [ - rd.choice( - [ - None, + lvl1_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for z in range(5) - ], - ] + None, + { + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_struct_map = [ - rd.choice( - [ - None, + }, + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - ( - rd.choice( - [None, np.random.randint(1, 1500)] - ), - rd.choice( - [None, np.random.randint(1, 1500)] - ), - ), - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - - pdf = pd.DataFrame( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + [ + rd.choice( + [None, np.random.randint(1, 1500)] + ) + for _ in range(5) + ], + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), ) - writer = pyorc.Writer( - buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE + lvl2_struct_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + { + "a": rd.choice( + [None, np.random.randint(1, 1500)] + ), + "b": rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(pdf.itertuples(index=False, name=None)), - ) + + pa_table = pa.Table.from_arrays( + [lvl1_map, 
lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - writer.writerows(tuples) - writer.close() + orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" + ) return buff @@ -1274,7 +1235,9 @@ def gen_map_buff(size=10000): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) @pytest.mark.parametrize("use_index", [True, False]) def test_map_type_read(columns, num_rows, use_index): - tbl = pa.orc.ORCFile(map_buff).read() + from pyarrow import orc + + tbl = orc.read_table(map_buff) lvl1_map = ( tbl["lvl1_map"] @@ -1460,18 +1423,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): ], ) def test_no_row_group_index_orc_read(datadir, fname): + from pyarrow import orc + fpath = datadir / fname - expect = pa.orc.ORCFile(fpath).read() + expect = orc.ORCFile(fpath).read() got = cudf.read_orc(fpath) assert expect.equals(got.to_arrow()) def test_names_in_struct_dtype_nesting(datadir): + from pyarrow import orc + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - expect = pa.orc.ORCFile(fname).read() + expect = orc.ORCFile(fname).read() got = cudf.read_orc(fname) # test dataframes @@ -1483,12 +1450,14 @@ def test_names_in_struct_dtype_nesting(datadir): def test_writer_lists_structs(list_struct_buff): + from pyarrow import orc + df_in = cudf.read_orc(list_struct_buff) buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + pyarrow_tbl = orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1527,12 +1496,10 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with pyorc.Writer( - buff, - pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), - ) as writer: - writer.write((maxint64, minint64, minint64)) - writer.write((1, -1, 1)) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert file_stats[0]["a"].get("sum") is None @@ -1545,22 +1512,24 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): + from pyarrow import orc + buff = BytesIO() - orc_schema = pyorc.Struct( - a=pyorc.BigInt(), - b=pyorc.Double(), - c=pyorc.String(), - d=pyorc.Decimal(11, 2), - e=pyorc.Date(), - f=pyorc.Timestamp(), - g=pyorc.Boolean(), - h=pyorc.Binary(), - i=pyorc.BigInt(), - # One column with non null value, else cudf/pyorc readers crash + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with pyorc.Writer(buff, orc_schema) as writer: - writer.write(data) + orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1615,6 +1584,8 @@ def test_select_nested(list_struct_buff, equivalent_columns): def test_orc_writer_rle_stream_size(datadir, tmpdir): + from pyarrow import orc + original = datadir / "TestOrcFile.int16.rle.size.orc" reencoded = tmpdir.join("int16_map.orc") @@ -1622,7 +1593,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): df.to_orc(reencoded) # Segfaults when RLE stream sizes don't account for varint length - pa_out = 
pa.orc.ORCFile(reencoded).read() + pa_out = orc.ORCFile(reencoded).read() assert df.to_arrow().equals(pa_out) @@ -1642,11 +1613,13 @@ def test_empty_columns(): def test_orc_reader_zstd_compression(list_struct_buff): + from pyarrow import orc + expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() - writer = pyarrow.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + writer = orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1845,10 +1818,7 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - pyorc_table = pa.Table.from_pandas( - negative_timestamp_df.to_pandas(), preserve_index=False - ) - pyarrow.orc.write_table(pyorc_table, buffer) + negative_timestamp_df.to_orc(buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1859,11 +1829,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): def test_orc_writer_negative_timestamp(negative_timestamp_df): + from pyarrow import orc + buffer = BytesIO() negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9349e8c216f..af4d0294293 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1280,49 +1280,120 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): simple_pdf.to_parquet(pdf_fname, data_page_version="2.0") assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0") + assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + + +def test_parquet_delta_byte_array(datadir): + fname = datadir / "delta_byte_arr.parquet" + assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname)) + + +def delta_num_rows(): + return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000] + @pytest.mark.parametrize("nrows", [1, 100000]) @pytest.mark.parametrize("add_nulls", [True, False]) -def test_delta_binary(nrows, add_nulls, tmpdir): +@pytest.mark.parametrize( + "dtype", + [ + "int8", + "int16", + "int32", + "int64", + ], +) +def test_delta_binary(nrows, add_nulls, dtype, tmpdir): null_frequency = 0.25 if add_nulls else 0 # Create a pandas dataframe with random data of mixed types arrow_table = dg.rand_dataframe( dtypes_meta=[ { - "dtype": "int8", - "null_frequency": null_frequency, - "cardinality": nrows, - }, - { - "dtype": "int16", + "dtype": dtype, "null_frequency": null_frequency, "cardinality": nrows, }, + ], + rows=nrows, + seed=0, + use_threads=False, + ) + # Roundabout conversion to pandas to preserve nulls/data types + cudf_table = cudf.DataFrame.from_arrow(arrow_table) + test_pdf = cudf_table.to_pandas(nullable=True) + pdf_fname = tmpdir.join("pdfv2.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding="DELTA_BINARY_PACKED", + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + 
assert_eq(cdf, pcdf) + + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cudfv2.parquet") + pcdf.to_parquet( + cudf_fname, + compression=None, + header_version="2.0", + use_dictionary=False, + ) + + # FIXME(ets): should probably not use more bits than the data type + try: + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + except OSError as e: + if dtype == "int32" and nrows == 100000: + pytest.mark.xfail( + reason="arrow does not support 33-bit delta encoding" + ) + else: + raise e + else: + assert_eq(cdf2, cdf) + + +@pytest.mark.parametrize("nrows", delta_num_rows()) +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize("str_encoding", ["DELTA_BYTE_ARRAY"]) +def test_delta_byte_array_roundtrip(nrows, add_nulls, str_encoding, tmpdir): + null_frequency = 0.25 if add_nulls else 0 + + # Create a pandas dataframe with random data of mixed lengths + test_pdf = dg.rand_dataframe( + dtypes_meta=[ { - "dtype": "int32", + "dtype": "str", "null_frequency": null_frequency, "cardinality": nrows, + "max_string_length": 10, }, { - "dtype": "int64", + "dtype": "str", "null_frequency": null_frequency, "cardinality": nrows, + "max_string_length": 100, }, ], rows=nrows, seed=0, use_threads=False, - ) - # Roundabout conversion to pandas to preserve nulls/data types - cudf_table = cudf.DataFrame.from_arrow(arrow_table) - test_pdf = cudf_table.to_pandas(nullable=True) - pdf_fname = tmpdir.join("pdfv2.parquet") + ).to_pandas() + + pdf_fname = tmpdir.join("pdfdeltaba.parquet") test_pdf.to_parquet( pdf_fname, version="2.6", - column_encoding="DELTA_BINARY_PACKED", + column_encoding=str_encoding, data_page_version="2.0", + data_page_size=64 * 1024, engine="pyarrow", use_dictionary=False, ) @@ -1331,6 +1402,58 @@ def test_delta_binary(nrows, add_nulls, tmpdir): assert_eq(cdf, pcdf) +@pytest.mark.parametrize("nrows", delta_num_rows()) +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize("str_encoding", ["DELTA_BYTE_ARRAY"]) +def test_delta_struct_list(tmpdir, nrows, add_nulls, str_encoding): + # Struct> + lists_per_row = 3 + list_size = 4 + num_rows = nrows + include_validity = add_nulls + + def list_gen_wrapped(x, y): + return list_row_gen( + int_gen, x * list_size * lists_per_row, list_size, lists_per_row + ) + + def string_list_gen_wrapped(x, y): + return list_row_gen( + string_gen, + x * list_size * lists_per_row, + list_size, + lists_per_row, + include_validity, + ) + + data = struct_gen( + [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], + 0, + num_rows, + include_validity, + ) + test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() + pdf_fname = tmpdir.join("pdfdeltaba.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding={ + "sol.col0": "DELTA_BINARY_PACKED", + "sol.col1": str_encoding, + "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", + "sol.col3.list.element.list.element": str_encoding, + }, + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + # sanity check to verify file is written properly + assert_eq(test_pdf, pd.read_parquet(pdf_fname)) + cdf = cudf.read_parquet(pdf_fname) + assert_eq(cdf, cudf.from_pandas(test_pdf)) + + @pytest.mark.parametrize( "data", [ @@ -1464,7 +1587,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) test_pdf2 = make_pdf(nrows=20) expect = pd.concat([test_pdf1, 
test_pdf2]) @@ -2825,6 +2947,14 @@ def test_parquet_reader_unsupported_page_encoding(datadir): cudf.read_parquet(fname) +def test_parquet_reader_detect_bad_dictionary(datadir): + fname = datadir / "bad_dict.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) @pytest.mark.parametrize("force_nullable_schema", [True, False]) def test_parquet_writer_schema_nullability(data, force_nullable_schema): diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d54a2eabf22..b92f84b677c 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -533,3 +533,18 @@ def test_write_chunked_parquet(s3_base, s3so): actual.sort_values(["b"]).reset_index(drop=True), cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), ) + + +def test_no_s3fs_on_cudf_import(): + import subprocess + import sys + + output = subprocess.check_output( + [ + sys.executable, + "-c", + "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", + ], + cwd="/", + ) + assert output.strip() == b"False" diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 09afb5680bd..bc0d6f37d89 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -7,6 +7,19 @@ from numba import config as numba_config +try: + from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, + ) +except ImportError: + + def patch_numba_linker_pynvjitlink(): + warnings.warn( + "CUDA Toolkit is newer than CUDA driver. " + "Numba features will not work in this configuration. 
" + ) + + CC_60_PTX_FILE = os.path.join( os.path.dirname(__file__), "../core/udf/shim_60.ptx" ) @@ -65,7 +78,7 @@ def _get_ptx_file(path, prefix): return regular_result[1] -def _patch_numba_mvc(): +def patch_numba_linker_cuda_11(): # Enable the config option for minor version compatibility numba_config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 @@ -106,29 +119,19 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - if driver_version >= (12, 0) and runtime_version > driver_version: - warnings.warn( - f"Using CUDA toolkit version {runtime_version} with CUDA " - f"driver version {driver_version} requires minor version " - "compatibility, which is not yet supported for CUDA " - "driver versions 12.0 and above. It is likely that many " - "cuDF operations will not work in this state. Please " - f"install CUDA toolkit version {driver_version} to " - "continue using cuDF." - ) - else: - # Support MVC for all CUDA versions in the 11.x range - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - CC_60_PTX_FILE - ) - # Numba thinks cubinlinker is only needed if the driver is older - # than the CUDA runtime, but when PTX files are present, it might - # also need to patch because those PTX files may be compiled by - # a CUDA version that is newer than the driver as well - if (driver_version < ptx_toolkit_version) or ( - driver_version < runtime_version - ): - _patch_numba_mvc() + ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE) + + # MVC is required whenever any PTX is newer than the driver + # This could be the shipped PTX file or the PTX emitted by + # the version of NVVM on the user system, the latter aligning + # with the runtime version + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): + if driver_version < (12, 0): + patch_numba_linker_cuda_11() + else: + patch_numba_linker_pynvjitlink() def _get_cuda_version_from_ptx_file(path): @@ -171,6 +174,8 @@ def _get_cuda_version_from_ptx_file(path): "7.8": (11, 8), "8.0": (12, 0), "8.1": (12, 1), + "8.2": (12, 2), + "8.3": (12, 3), } cuda_ver = ver_map.get(version) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 91925bf3c0c..6641bd8290a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -13,7 +13,6 @@ import pandas as pd from fsspec.core import get_fs_token_paths from pyarrow import PythonFile as ArrowPythonFile -from pyarrow.fs import FSSpecHandler, PyFileSystem from pyarrow.lib import NativeFile from cudf.utils.docutils import docfmt_partial @@ -288,6 +287,14 @@ include the file path metadata (relative to `root_path`). To request metadata binary blob when using with ``partition_cols``, Pass ``return_metadata=True`` instead of specifying ``metadata_file_path`` +use_dictionary : bool, default True + When ``False``, prevents the use of dictionary encoding for Parquet page + data. When ``True``, dictionary encoding is preferred when not disabled due + to dictionary size constraints. +header_version : {{'1.0', '2.0'}}, default "1.0" + Controls whether to use version 1.0 or version 2.0 page headers when + encoding. Version 1.0 is more portable, but version 2.0 enables the + use of newer encoding schemes. force_nullable_schema : bool, default False. If True, writes all columns as `null` in schema. 
If False, columns are written as `null` if they contain null values, @@ -1630,6 +1637,15 @@ def _open_remote_files( for path, rgs in zip(paths, row_groups) ] + # Avoid top-level pyarrow.fs import. + # Importing pyarrow.fs initializes a S3 SDK with a finalizer + # that runs atexit. In some circumstances it appears this + # runs a call into a logging system that is already shutdown. + # To avoid this, we only import this subsystem if it is + # really needed. + # See https://github.com/aws/aws-sdk-cpp/issues/2681 + from pyarrow.fs import FSSpecHandler, PyFileSystem + # Default open - Use pyarrow filesystem API pa_fs = PyFileSystem(FSSpecHandler(fs)) return [ diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py new file mode 100644 index 00000000000..a4404e51232 --- /dev/null +++ b/python/cudf/cudf/utils/nvtx_annotation.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import hashlib +from functools import partial + +from nvtx import annotate + +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _cudf_nvtx_annotate(func, domain="cudf_python"): + """Decorator for applying nvtx annotations to methods in cudf.""" + return annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + )(func) + + +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index ffc3c29c996..ec5693e14d2 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,16 +2,13 @@ import decimal import functools -import hashlib import os import traceback import warnings -from functools import partial from typing import FrozenSet, Set, Union import numpy as np import pandas as pd -from nvtx import annotate import rmm @@ -120,8 +117,6 @@ def _array_ufunc(obj, ufunc, method, inputs, kwargs): "__ge__", } -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") @@ -343,28 +338,6 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _cudf_nvtx_annotate(func, domain="cudf_python"): - """Decorator for applying nvtx annotations to methods in cudf.""" - return annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - )(func) - - -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) - - def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf/cudf_pandas_tests/data/profile_basic.py b/python/cudf/cudf_pandas_tests/data/profile_basic.py new file mode 100644 index 00000000000..f7b4ba89ce7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/profile_basic.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +import pandas as pd + +df = pd.DataFrame( + { + "size": [10, 11, 12, 10, 11, 12, 10, 6, 11, 10], + "total_bill": [100, 200, 100, 200, 100, 100, 200, 50, 10, 560], + } +) +df["size"].value_counts() +df.groupby("size").total_bill.mean() +df.apply(list, axis=1) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index a947d67b724..4921446ab6b 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -2,6 +2,9 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import os +import subprocess + from cudf.pandas import LOADED, Profiler if not LOADED: @@ -68,3 +71,41 @@ def test_profiler_fast_slow_name_mismatch(): with Profiler(): df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df.iloc[0, 1] = "foo" + + +def test_profiler_commandline(): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + # Setting the 'COLUMNS' environment variable to a large number + # because the terminal output shouldn't be compressed for + # text validations below. + env["COLUMNS"] = "10000" + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + "--profile", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + assert sp_completed.returncode == 0 + output = sp_completed.stdout + + for string in [ + "Total time", + "Stats", + "Function", + "GPU ncalls", + "GPU cumtime", + "GPU percall", + "CPU ncalls", + "CPU cumtime", + "CPU percall", + ]: + assert string in output diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index a1fec83c1b9..b38970271d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", - "cython>=3.0.0", + "cython>=3.0.3", "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==12.0.1.*", - "rmm==23.10.*", + "pyarrow==14.0.1.*", + "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -17,7 +17,7 @@ requires = [ [project] name = "cudf" -version = "23.10.00" +dynamic = ["version"] description = "cuDF - GPU Dataframe" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -38,9 +38,9 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==12.*", + "pyarrow>=14.0.1,<15.0.0a0", "rich", - "rmm==23.10.*", + "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -60,7 +60,6 @@ test = [ "hypothesis", "mimesis>=4.1.0", "msgpack", - "pyorc", "pytest", "pytest-benchmark", "pytest-cases", @@ -127,6 +126,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 96b91b4ccc0..984cd63a7c9 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,6 +6,8 @@ packages = find_packages(include=["cudf*", "udf_cpp*"]) setup( packages=packages, - package_data={key: ["*.pxd", "*.hpp", "*.cuh"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt new file mode 100644 index 00000000000..d55c3fdc076 --- /dev/null +++ b/python/cudf_kafka/CMakeLists.txt @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +set(cudf_kafka_version 23.12.00) + +include(../../fetch_rapids.cmake) + +project( + cudf-kafka-python + VERSION ${cudf_kafka_version} + LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C CXX +) + +find_package(cudf_kafka ${cudf_kafka_version} REQUIRED) + +if(NOT cudf_kafka_FOUND) + message( + FATAL_ERROR + "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package." 
+ ) +endif() + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(cudf_kafka/_lib) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cudf_kafka/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/cudf_kafka/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/VERSION b/python/cudf_kafka/cudf_kafka/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/__init__.py b/python/cudf_kafka/cudf_kafka/__init__.py index e69de29bb2d..43a91af9cf5 100644 --- a/python/cudf_kafka/cudf_kafka/__init__.py +++ b/python/cudf_kafka/cudf_kafka/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +from ._version import __git_commit__, __version__ diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt new file mode 100644 index 00000000000..3262b7d5ebe --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources kafka.pyx) +set(linked_libraries cudf_kafka::cudf_kafka) + +rapids_cython_create_modules( + CXX ASSOCIATED_TARGETS cudf_kafka + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was +# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in +# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. +# find_package(Python REQUIRED COMPONENTS Development NumPy) + +# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake +# inside the manylinux images used to build wheels because manylinux images do not contain +# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should +# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try +# switching back, but it may not work if that implicitly still requires Python libraries. In that +# case we'll need to follow up with the CMake team to remove that dependency. 
The stopgap solution +# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static +# libs to be found, but that should be a last resort since it implies a dependency that isn't really +# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C +# /opt/_internal" +find_package(NumPy REQUIRED) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index ca729c62512..068837d04ee 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource -cdef extern from "kafka_callback.hpp" \ +cdef extern from "cudf_kafka/kafka_callback.hpp" \ namespace "cudf::io::external::kafka" nogil: ctypedef object (*python_callable_type)() -cdef extern from "kafka_consumer.hpp" \ +cdef extern from "cudf_kafka/kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 4d732478723..2fbaacff7c6 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -3,12 +3,11 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool, nullptr from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.datasource cimport datasource -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py new file mode 100644 index 00000000000..5adab566da0 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf_kafka") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 386cdc32ab1..15431161d75 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,16 +3,17 @@ [build-system] requires = [ - "cython>=3.0.0", + "cython>=3.0.3", "numpy>=1.21,<1.25", - "pyarrow==12.0.1.*", + "pyarrow==14.0.1.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "cudf_kafka" -version = "23.10.00" +dynamic = ["version"] description = "cuDF Kafka Datasource" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -21,7 +22,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -38,6 +39,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf_kafka/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index d955d95858a..6a99e9ed968 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,96 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -import os -import shutil -import sysconfig -from distutils.sysconfig import get_python_lib - -import numpy as np -import pyarrow as pa -from Cython.Build import cythonize -from setuptools import find_packages, setup -from setuptools.extension import Extension - -cython_files = ["cudf_kafka/_lib/*.pyx"] - -CUDA_HOME = os.environ.get("CUDA_HOME", False) -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." 
- ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") - -cuda_include_dir = os.path.join(CUDA_HOME, "include") - -CUDF_ROOT = os.environ.get( - "CUDF_ROOT", - os.path.abspath( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/" - ) - ), -) -CUDF_KAFKA_ROOT = os.environ.get( - "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build" -) - -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 - -extensions = [ - Extension( - "*", - sources=cython_files, - include_dirs=[ - os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")), - os.path.abspath(os.path.join(CUDF_ROOT, "../include")), - os.path.abspath( - os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka") - ), - os.path.join(CUDF_ROOT, "include"), - os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), - os.path.join( - os.path.dirname(sysconfig.get_path("include")), - "rapids/libcudacxx", - ), - os.path.dirname(sysconfig.get_path("include")), - np.get_include(), - pa.get_include(), - cuda_include_dir, - ], - library_dirs=( - [ - get_python_lib(), - os.path.join(os.sys.prefix, "lib"), - CUDF_KAFKA_ROOT, - ] - ), - libraries=["cudf", "cudf_kafka"], - language="c++", - extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"], - ) -] +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from setuptools import find_packages +from skbuild import setup packages = find_packages(include=["cudf_kafka*"]) + setup( - # Include the separately-compiled shared library - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), packages=packages, - package_data={key: ["*.pxd"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) diff --git a/python/custreamz/custreamz/VERSION b/python/custreamz/custreamz/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/custreamz/custreamz/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/custreamz/custreamz/__init__.py b/python/custreamz/custreamz/__init__.py index 52be76aab1f..3f11da14684 100644 --- a/python/custreamz/custreamz/__init__.py +++ b/python/custreamz/custreamz/__init__.py @@ -1,3 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +from ._version import __git_commit__, __version__ from .kafka import Consumer diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py new file mode 100644 index 00000000000..0f545f95f2b --- /dev/null +++ b/python/custreamz/custreamz/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("custreamz") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 47ade91b5eb..2d0059d5aa9 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "custreamz" -version = "23.10.00" +dynamic = ["version"] description = "cuStreamz - GPU Accelerated Streaming" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==23.10.*", - "cudf_kafka==23.10.*", + "cudf==23.12.*", + "cudf_kafka==23.12.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -48,6 +48,9 @@ Homepage = "https://github.com/rapidsai/cudf" license-files = ["LICENSE"] zip-safe = false +[tool.setuptools.dynamic] +version = {file = "custreamz/VERSION"} + [tools.setuptools.packages.find] include = [ "custreamz", diff --git a/python/custreamz/setup.py b/python/custreamz/setup.py index 2fa45ac8087..04943bf88e2 100644 --- a/python/custreamz/setup.py +++ b/python/custreamz/setup.py @@ -2,4 +2,6 @@ from setuptools import setup -setup() +setup( + package_data={"custreamz": ["VERSION"]}, +) diff --git a/python/dask_cudf/dask_cudf/VERSION b/python/dask_cudf/dask_cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/dask_cudf/dask_cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 6952c3d5882..c152a9e6a81 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -5,6 +5,7 @@ import cudf from . import backends +from ._version import __git_commit__, __version__ from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg from .io import read_csv, read_json, read_orc, read_text, to_orc @@ -14,8 +15,6 @@ except ImportError: pass -__version__ = "23.10.00" - __all__ = [ "DataFrame", "Series", diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py new file mode 100644 index 00000000000..0dd62854a4e --- /dev/null +++ b/python/dask_cudf/dask_cudf/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("dask_cudf") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2be256f85e8..387643587d1 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -12,6 +12,7 @@ import dask.dataframe as dd from dask import config +from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, @@ -42,7 +43,7 @@ import cudf from cudf.api.types import is_string_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -320,56 +321,45 @@ def get_grouper_cudf(obj): return cudf.core.groupby.Grouper -try: - try: - from dask.array.dispatch import percentile_lookup - except ImportError: - from dask.dataframe.dispatch import ( - percentile_dispatch as percentile_lookup, - ) - - @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) - @_dask_cudf_nvtx_annotate - def percentile_cudf(a, q, interpolation="linear"): - # Cudf dispatch to the equivalent of `np.percentile`: - # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html - a = cudf.Series(a) - # a is series. - n = len(a) - if not len(a): - return None, n - if isinstance(q, Iterator): - q = list(q) - - if cudf.api.types.is_categorical_dtype(a.dtype): - result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - - return ( - pd.Categorical.from_codes( - result, a.dtype.categories, a.dtype.ordered - ), - n, - ) - if np.issubdtype(a.dtype, np.datetime64): - result = a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ) +@percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) +@_dask_cudf_nvtx_annotate +def percentile_cudf(a, q, interpolation="linear"): + # Cudf dispatch to the equivalent of `np.percentile`: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html + a = cudf.Series(a) + # a is series. 
+ n = len(a) + if not len(a): + return None, n + if isinstance(q, Iterator): + q = list(q) + + if cudf.api.types.is_categorical_dtype(a.dtype): + result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - if q[0] == 0: - # https://github.com/dask/dask/issues/6864 - result[0] = min(result[0], a.min()) - return result.to_pandas(), n - if not np.issubdtype(a.dtype, np.number): - interpolation = "nearest" return ( - a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ).to_pandas(), + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), n, ) + if np.issubdtype(a.dtype, np.datetime64): + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) -except ImportError: - pass + if q[0] == 0: + # https://github.com/dask/dask/issues/6864 + result[0] = min(result[0], a.min()) + return result.to_pandas(), n + if not np.issubdtype(a.dtype, np.number): + interpolation = "nearest" + return ( + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), + n, + ) @pyarrow_schema_dispatch.register((cudf.DataFrame,)) @@ -486,6 +476,31 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() +# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 +try: + from dask.dataframe.dispatch import partd_encode_dispatch + + @partd_encode_dispatch.register(cudf.DataFrame) + def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle + from functools import partial + + import partd + + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) + + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) + +except ImportError: + pass + + def _default_backend(func, *args, **kwargs): # Utility to call a dask.dataframe function with # the default ("pandas") backend diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5b37e6e825c..17650c9b70d 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -22,7 +22,7 @@ import cudf from cudf import _lib as libcudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f4bbcaf4dd1..b1fdf443a17 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -15,7 +15,7 @@ from dask.utils import funcname import cudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 85ec36cf2c5..7b4e20012f7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -148,7 +148,6 @@ def test_roundtrip_from_pandas(tmpdir): def test_strings(tmpdir): - fn = str(tmpdir) dfp = pd.DataFrame( {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]} @@ -161,7 +160,6 @@ def test_strings(tmpdir): def test_dask_timeseries_from_pandas(tmpdir): - fn = str(tmpdir.join("test.parquet")) ddf2 = dask.datasets.timeseries(freq="D") pdf = ddf2.compute() @@ -173,7 +171,6 @@ def 
test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_dask(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) @@ -188,7 +185,6 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( cudf.datasets.timeseries(freq="D"), npartitions=4 @@ -205,7 +201,6 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, True]) def test_empty(tmpdir, index): - fn = str(tmpdir) dfp = pd.DataFrame({"a": [11.0, 12.0, 12.0], "b": [4, 5, 6]})[:0] if index: @@ -218,7 +213,6 @@ def test_empty(tmpdir, index): def test_filters(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -251,7 +245,6 @@ def test_filters(tmpdir): @pytest.mark.parametrize("numeric", [True, False]) @pytest.mark.parametrize("null", [np.nan, None]) def test_isna_filters(tmpdir, null, numeric): - tmp_path = str(tmpdir) df = pd.DataFrame( { @@ -284,7 +277,6 @@ def test_isna_filters(tmpdir, null, numeric): def test_filters_at_row_group_level(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -405,7 +397,6 @@ def test_split_row_groups(tmpdir, row_groups, index): @need_create_meta @pytest.mark.parametrize("partition_on", [None, "a"]) def test_create_metadata_file(tmpdir, partition_on): - tmpdir = str(tmpdir) # Write ddf without a _metadata file @@ -445,7 +436,6 @@ def test_create_metadata_file(tmpdir, partition_on): @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): - # NOTE: This test demonstrates that the CudfEngine # can be used to generate a global `_metadata` file # even if there are inconsistent schemas in the dataset. 
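The import changes above (core.py, groupby.py) and in sorting.py below follow from relocating the NVTX helpers out of cudf.utils.utils into the new cudf.utils.nvtx_annotation module added earlier in this diff. A minimal usage sketch under the new import path; the _double_values helper is a hypothetical example, not code from this patch, and it assumes a working GPU-enabled cudf installation:

# Hypothetical sketch (not part of this patch): decorating a helper with the
# relocated NVTX annotation utility from cudf.utils.nvtx_annotation.
import cudf
from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate

@_dask_cudf_nvtx_annotate
def _double_values(df):
    # Runs inside an NVTX range in the "dask_cudf_python" domain, named
    # after the function's __qualname__ and colored via the sha256-based
    # _get_color_for_nvtx helper shown earlier in this diff.
    return df * 2

print(_double_values(cudf.DataFrame({"a": [1, 2, 3]})))

Profilers such as Nsight Systems group these ranges by domain, which is why the cudf ("cudf_python") and dask_cudf ("dask_cudf_python") decorators remain separate entry points.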
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d6c9c1be73c..27ba82c390c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -16,7 +16,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 94609b180d6..e58255cda06 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -114,3 +114,14 @@ def test_sort_values_empty_string(by): if "a" in by: expect = df.sort_values(by) assert dd.assert_eq(got, expect, check_index=False) + + +def test_disk_shuffle(): + try: + from dask.dataframe.dispatch import partd_encode_dispatch # noqa: F401 + except ImportError: + pytest.skip("need a version of dask that has partd_encode_dispatch") + df = cudf.DataFrame({"a": [1, 2, 3] * 20, "b": [4, 5, 6, 7] * 15}) + ddf = dd.from_pandas(df, npartitions=4) + got = dd.DataFrame.shuffle(ddf, "a", shuffle="disk") + dd.assert_eq(got, df) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 41b57b71749..0306da3de46 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "dask_cudf" -version = "23.10.00" +dynamic = ["version", "entry-points"] description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -18,13 +18,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask==2023.9.2", - "distributed==2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -35,11 +34,10 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] -dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "dask-cuda==23.10.*", + "dask-cuda==23.12.*", "numba>=0.57,<0.58", "pytest", "pytest-cov", @@ -52,6 +50,9 @@ Homepage = "https://github.com/rapidsai/cudf" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "dask_cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 3fa0f257834..c6ce219d32f 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -2,9 +2,12 @@ from setuptools import find_packages, setup +packages = find_packages(exclude=["tests", "tests.*"]) + setup( include_package_data=True, - packages=find_packages(exclude=["tests", "tests.*"]), + packages=packages, + package_data={key: ["VERSION"] for key in packages}, entry_points={ "dask.dataframe.backends": [ "cudf = dask_cudf.backends:CudfBackendEntrypoint",